diff --git a/.gitignore b/.gitignore
index 2c3747f..3013a7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,5 @@ copy-qmds.R
 crossref.sh
 cover.png
 /.quarto/
+*.tex
+*.pdf
\ No newline at end of file
diff --git a/_quarto.yml b/_quarto.yml
index 06699ba..086101e 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -43,6 +43,7 @@ book:
       - inference/clt.qmd
       - inference/confidence-intervals.qmd
       - inference/hypothesis-testing.qmd
+      - inference/bootstrap.qmd
       - inference/models.qmd
       - inference/bayes.qmd
       - inference/hierarchical-models.qmd
@@ -83,6 +84,18 @@ format:
     code-link: true
     author-meta: Rafael A. Irizarry
     callout-appearance: simple
+  pdf:
+    documentclass: krantz
+    classoption: [krantz2,10pt,twoside,onecolumn,final,openright]
+    include-in-header: preamble.tex
+    header-includes: |
+      \usepackage{amssymb}
+      \usepackage{amsmath}
+      \usepackage{graphicx}
+      \usepackage{subfigure}
+      \usepackage{makeidx}
+      \usepackage{multicol}
+    keep-tex: true
 knitr:
   opts_chunk:
diff --git a/docs/highdim/dimension-reduction.html b/docs/highdim/dimension-reduction.html
index 980c783..419a92a 100644
--- a/docs/highdim/dimension-reduction.html
+++ b/docs/highdim/dimension-reduction.html
@@ -5,7 +5,7 @@
-Advanced Data Science - 21  Dimension reduction
+Advanced Data Science - 22  Dimension reduction
10  Bootstrap


The CLT provides a useful approach to building confidence intervals and performing hypothesis testing. However, it does not always apply. Here we provide a short introduction to an alternative approach to estimating the distribution of an estimate that does not rely on the CLT.

+

+10.1 Example: median income

+

Suppose the income distribution of your population is as follows:

+
+
set.seed(1995)
+n <- 10^6
+income <- 10^(rnorm(n, log10(45000), log10(3)))
+hist(income/10^3, nclass = 1000)

The population median is:

+
+
m <- median(income)
+m
+#> [1] 44939
+
+

Suppose we don’t have access to the entire population, but want to estimate the median \(m\). We take a sample of 100 and estimate the population median \(m\) with the sample median \(M\):

+
+
N <- 100
+x <- sample(income, N)
+median(x)
+#> [1] 38461
+
+

+10.2 Confidence intervals for the median

+

Can we construct a confidence interval? What is the distribution of \(M\)?

+

Because we are simulating the data, we can use a Monte Carlo simulation to learn the distribution of \(M\).

+
+
library(gridExtra)
+B <- 10^4
+m <- replicate(B, {
+  x <- sample(income, N)
+  median(x)
+})
+hist(m, nclass = 30)
+qqnorm(scale(m)); abline(0,1)

If we know this distribution, we can construct a confidence interval. The problem here is that, as we have already described, in practice we do not have access to the distribution. Previously, we used the Central Limit Theorem, but the CLT we studied applies to averages, and here we are interested in the median. We can see that the 95% confidence interval based on the CLT

+
+
median(x) + 1.96*sd(x)/sqrt(N)*c(-1, 1)
+#> [1] 21018 55905
+
+

is quite different from the confidence interval we would generate if we knew the actual distribution of \(M\):

+
+
quantile(m, c(0.025, 0.975))
+#>  2.5% 97.5% 
+#> 34438 59050
+
+

The bootstrap permits us to approximate a Monte Carlo simulation without access to the entire distribution. The general idea is relatively simple: we act as if the observed sample is the population. We then sample, with replacement, datasets of the same size as the original dataset. Finally, we compute the summary statistic, in this case the median, on each of these bootstrap samples.

+

Theory tells us that, in many situations, the distribution of the statistic computed on the bootstrap samples approximates the distribution of our actual statistic. This is how we construct bootstrap samples and an approximate distribution:

+
+
B <- 10^4
+m_star <- replicate(B, {
+  x_star <- sample(x, N, replace = TRUE)
+  median(x_star)
+})
+
+
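To see how well the bootstrap distribution approximates the Monte Carlo distribution of \(M\), we can compare the two sets of medians computed above with a quantile-quantile plot. This comparison is an added illustration, not part of the original text, and assumes the objects m and m_star defined above are still available:
## compare the quantiles of the Monte Carlo medians (m) and the bootstrap medians (m_star)
qqplot(m, m_star, xlab = "Monte Carlo medians", ylab = "Bootstrap medians")
abline(0, 1)  ## identity line for reference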

Note that a confidence interval constructed with the bootstrap is much closer to the one constructed with the actual distribution:

+
+
quantile(m_star, c(0.025, 0.975))
+#>  2.5% 97.5% 
+#> 30253 56909
+
+

For more on the bootstrap, including corrections one can apply to improve these confidence intervals, please consult the book An Introduction to the Bootstrap by Bradley Efron and Robert Tibshirani.
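If you want to try such corrections, one option is the boot package, which implements several bootstrap confidence intervals, including the bias-corrected and accelerated (BCa) interval. The following is only a sketch, not part of the original text, applied to the sample x defined above:
library(boot)
## the statistic passed to boot() must accept the data and a vector of resampled indices
med <- function(d, i) median(d[i])
boot_fit <- boot(x, med, R = 10^4)
## percentile and BCa intervals for the median
boot.ci(boot_fit, type = c("perc", "bca"))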

+

+10.3 Exercises

+

1. Generate a random dataset like this:

+
+
y <- rnorm(100, 0, 1)
+
+

Estimate the 75th quantile, which we know is:

+
+
qnorm(0.75)
+
+

with the sample quantile:

+
+
quantile(y, 0.75)
+
+

Run a Monte Carlo simulation to learn the expected value and standard error of this random variable.

+

2. In practice, we can’t run a Monte Carlo simulation because we don’t know if rnorm is being used to simulate the data. Use the bootstrap to estimate the standard error using just the initial sample y. Use 10 bootstrap samples.

+

3. Redo exercise 2, but with 10,000 bootstrap samples.

\ No newline at end of file
diff --git a/docs/inference/clt.html b/docs/inference/clt.html
index 189d421..83ebf65 100644
--- a/docs/inference/clt.html
+++ b/docs/inference/clt.html
@@ -420,38 +426,38 @@

-

The CLT tells us that the distribution function for a sum of draws is approximately normal. We also learned that dividing a normally distributed random variable by a constant is also a normally distributed variable. This implies that the distribution of \(\bar{X}\) is approximately normal.

+

The CLT tells us that the distribution function for a sum of draws is approximately normal. Additionally, we have learned that dividing a normally distributed random variable by a constant results in another normally distributed variable. This implies that the distribution of \(\bar{X}\) is approximately normal.

In summary, we have that \(\bar{X}\) has an approximately normal distribution with expected value \(p\) and standard error \(\sqrt{p(1-p)/N}\).

-

Now how does this help us? Suppose we want to know what is the probability that we are within 1% from \(p\). We are basically asking what is

+

Now how does this help us? Suppose we want to know the probability that we are within 1% of \(p\). We are basically asking what is:

\[ \mbox{Pr}(| \bar{X} - p| \leq .01) \] which is the same as:

\[ \mbox{Pr}(\bar{X}\leq p + .01) - \mbox{Pr}(\bar{X} \leq p - .01) \]

-

Can we answer this question? We can use the mathematical trick we learned in the previous chapter. Subtract the expected value and divide by the standard error to get a standard normal random variable, call it \(Z\), on the left. Since \(p\) is the expected value and \(\mbox{SE}(\bar{X}) = \sqrt{p(1-p)/N}\) is the standard error we get:

+

Can we answer this question? We can use the mathematical trick we learned in the previous section: subtract the expected value and divide by the standard error to obtain a standard normal random variable, which we’ll denote as \(Z\), on the left. Since \(p\) is the expected value and \(\mbox{SE}(\bar{X}) = \sqrt{p(1-p)/N}\) is the standard error, we get:

\[ \mbox{Pr}\left(Z \leq \frac{ \,.01} {\mbox{SE}(\bar{X})} \right) - \mbox{Pr}\left(Z \leq - \frac{ \,.01} {\mbox{SE}(\bar{X})} \right) \]

-

One problem we have is that since we don’t know \(p\), we don’t know \(\mbox{SE}(\bar{X})\). But it turns out that the CLT still works if we estimate the standard error by using \(\bar{X}\) in place of \(p\). We say that we plug-in the estimate. Our estimate of the standard error is therefore:

+

One problem we have is that since we don’t know \(p\), we don’t know \(\mbox{SE}(\bar{X})\). However, it turns out that the CLT still works if we estimate the standard error by using \(\bar{X}\) in place of \(p\). We say that we plug in the estimate. Our estimate of the standard error is therefore:

\[ \hat{\mbox{SE}}(\bar{X})=\sqrt{\bar{X}(1-\bar{X})/N} \] In statistics textbooks, we use a little hat to denote estimates. The estimate can be constructed using the observed data and \(N\).

-

Now we continue with our calculation, but dividing by \(\hat{\mbox{SE}}(\bar{X})=\sqrt{\bar{X}(1-\bar{X})/N})\) instead. In our first sample we had 12 blue and 13 red so \(\bar{X} = 0.48\) and our estimate of standard error is:

+

Now we continue with our calculation, but dividing by \(\hat{\mbox{SE}}(\bar{X})=\sqrt{\bar{X}(1-\bar{X})/N}\) instead. In our first sample, we had 12 blue and 13 red, so \(\bar{X} = 0.48\) and our estimate of the standard error is:

x_hat <- 0.48
 se <- sqrt(x_hat*(1-x_hat)/25)
 se
 #> [1] 0.0999
-

And now we can answer the question of the probability of being close to \(p\). The answer is:

+

Now, we can answer the question of the probability of being close to \(p\). The answer is:

pnorm(0.01/se) - pnorm(-0.01/se)
 #> [1] 0.0797

Therefore, there is a small chance that we will be close. A poll of only \(N=25\) people is not really very useful, at least not for a close election.

-

Earlier we mentioned the margin of error. Now we can define it because it is simply two times the standard error, which we can now estimate. In our case it is:

+

Earlier, we mentioned the margin of error. Now, we can define it simply as two times the standard error, which we can now estimate. In our case it is:

1.96*se
 #> [1] 0.196
@@ -470,22 +476,22 @@

pnorm(1.96) - pnorm(-1.96)
 #> [1] 0.95

-

Hence, there is a 95% probability that \(\bar{X}\) will be within \(1.96\times \hat{SE}(\bar{X})\), in our case within about 0.2, of \(p\). Note that 95% is somewhat of an arbitrary choice and sometimes other percentages are used, but it is the most commonly used value to define margin of error. We often round 1.96 up to 2 for simplicity of presentation.

+

Hence, there is a 95% probability that \(\bar{X}\) will be within \(1.96\times \hat{SE}(\bar{X})\), in our case within about 0.2, of \(p\). Observe that 95% is somewhat of an arbitrary choice and sometimes other percentages are used, but it is the most commonly used value to define margin of error. We often round 1.96 up to 2 for simplicity of presentation.

In summary, the CLT tells us that our poll based on a sample size of \(25\) is not very useful. We don’t really learn much when the margin of error is this large. All we can really say is that the popular vote will not be won by a large margin. This is why pollsters tend to use larger sample sizes.

-

From the table above, we see that typical sample sizes range from 700 to 3500. To see how this gives us a much more practical result, notice that if we had obtained a \(\bar{X}\)=0.48 with a sample size of 2,000, our standard error \(\hat{\mbox{SE}}(\bar{X})\) would have been 0.0111714. So our result is an estimate of 48% with a margin of error of 2%. In this case, the result is much more informative and would make us think that there are more red balls than blue. Keep in mind, however, that this is hypothetical. We did not take a poll of 2,000 since we don’t want to ruin the competition.

+

From the table above, we see that typical sample sizes range from 700 to 3500. To see how this gives us a much more practical result, consider that if we had obtained a \(\bar{X}\)=0.48 with a sample size of 2,000, our standard error \(\hat{\mbox{SE}}(\bar{X})\) would have been 0.0111714. So our result is an estimate of 48% with a margin of error of 2%. In this case, the result is much more informative and would lead us to believe that there are more red balls than blue. Keep in mind, however, that this is hypothetical. We did not take a poll of 2,000, since we don’t want to ruin the competition.
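As a quick check of these numbers (an added illustration, not part of the original text), we can plug \(\bar{X}=0.48\) and \(N=2000\) into the standard error formula:
x_hat <- 0.48
se_hat <- sqrt(x_hat*(1 - x_hat)/2000)  ## estimated standard error with N = 2,000
se_hat
#> [1] 0.0112
1.96*se_hat  ## margin of error
#> [1] 0.0219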

7.1 A Monte Carlo simulation

-

Suppose we want to use a Monte Carlo simulation to corroborate the tools we have built using probability theory. To create the simulation, we would write code like this:

-
+

Suppose we want to use a Monte Carlo simulation to corroborate the tools we have developed using probability theory. To create the simulation, we would write code like this:

+
B <- 10000
 N <- 1000
 x_hat <- replicate(B, {
-  x <- sample(c(0,1), size = N, replace = TRUE, prob = c(1-p, p))
+  x <- sample(c(0,1), size = N, replace = TRUE, prob = c(1 - p, p))
   mean(x)
 })
-

The problem is, of course, we don’t know p. We could construct an urn like the one pictured above and run an analog (without a computer) simulation. It would take a long time, but you could take 10,000 samples, count the beads and keep track of the proportions of blue. We can use the function take_poll(n=1000) instead of drawing from an actual urn, but it would still take time to count the beads and enter the results.

-

One thing we therefore do to corroborate theoretical results is to pick one or several values of p and run the simulations. Let’s set p=0.45. We can then simulate a poll:

+

The problem is, of course, that we don’t know p. We could construct an urn, similar to the one pictured above, and conduct an analog simulation (without a computer). While time-consuming, we could take 10,000 samples, count the beads, and track the proportions of blue. We can use the function take_poll(n=1000), instead of drawing from an actual urn, but it would still take time to count the beads and enter the results.

+

Therefore, one approach we can use to corroborate theoretical results is to pick one or several values of p and run simulations. Let’s set p=0.45. We can then simulate a poll:

p <- 0.45
 N <- 1000
@@ -501,14 +507,14 @@ 

  mean(x)
})

-

To review, the theory tells us that \(\bar{X}\) is approximately normally distributed, has expected value \(p=\) 0.45 and standard error \(\sqrt{p(1-p)/N}\) = 0.0157321. The simulation confirms this:

+

To review, the theory tells us that \(\bar{X}\) is approximately normally distributed, has expected value \(p=\) 0.45, and standard error \(\sqrt{p(1-p)/N}\) = 0.0157321. The simulation confirms this:

mean(x_hat)
 #> [1] 0.45
 sd(x_hat)
-#> [1] 0.0158
+#> [1] 0.0157
-

A histogram and qq-plot confirm that the normal approximation is accurate as well:

+

A histogram and qqplot confirm that the normal approximation is also accurate:
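The plotting code is not shown in this hunk. A minimal way to produce such plots from the simulated x_hat values (a sketch added here, not necessarily the code used in the book) is:
hist(x_hat, nclass = 30)            ## distribution of the simulated sample proportions
qqnorm(scale(x_hat)); abline(0, 1)  ## compare standardized values to the standard normal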

@@ -517,25 +523,25 @@

-

Of course, in real life we would never be able to run such an experiment because we don’t know \(p\). But we could run it for various values of \(p\) and \(N\) and see that the theory does indeed work well for most values. You can easily do this by re-running the code above after changing p and N.

+

Of course, in real life, we would never be able to run such an experiment because we don’t know \(p\). However, we can run it for various values of \(p\) and \(N\) and see that the theory does indeed work well for most values. You can easily do this by rerunning the code above after changing the values of p and N.

7.2 The spread

-

The competition is to predict the spread, not the proportion \(p\). However, because we are assuming there are only two parties, we know that the spread is \(\mu = p - (1-p) = 2p - 1\). As a result, everything we have done can easily be adapted to an estimate of \(\mu\). Once we have our estimate \(\bar{X}\) and \(\hat{\mbox{SE}}(\bar{X})\), we estimate the spread with \(2\bar{X} - 1\) and, since we are multiplying by 2, the standard error is \(2\hat{\mbox{SE}}(\bar{X})\). Note that subtracting 1 does not add any variability so it does not affect the standard error.

-

For our 25 item sample above, our estimate \(p\) is .48 with margin of error .20 and our estimate of the spread is 0.04 with margin of error .40. Again, not a very useful sample size. However, the point is that once we have an estimate and standard error for \(p\), we have it for the spread \(\mu\).

+

The objective of the competition is to predict the spread, not the proportion \(p\). However, since we are assuming there are only two parties, we know that the spread is \(\mu = p - (1-p) = 2p - 1\). As a result, everything we have done can easily be adapted to an estimate of \(\mu\). Once we have our estimate \(\bar{X}\) and \(\hat{\mbox{SE}}(\bar{X})\), we estimate the spread with \(2\bar{X} - 1\) and, since we are multiplying by 2, the standard error is \(2\hat{\mbox{SE}}(\bar{X})\). Note that subtracting 1 does not add any variability, so it does not affect the standard error.

+

For our 25 item sample above, our estimate of \(p\) is .48 with margin of error .20, and our estimate of the spread is 0.04 with margin of error .40. Again, this is not a very useful sample size. Nevertheless, the point is that, once we have an estimate and standard error for \(p\), we have it for the spread \(\mu\).

-

We use \(\mu\) the denote the spread here and in the next chapters because this is the typical notation used in statistical textbooks for the parameter of interest. The reason we use \(\mu\) is because a populuation mean is often the parameter of interest and \(\mu\) is the Greek letter for m.

+

We use \(\mu\) to denote the spread here and in the next sections because this is the typical notation used in statistical textbooks for the parameter of interest. The reason we use \(\mu\) is that a population mean is often the parameter of interest, and \(\mu\) is the Greek letter for m.

-7.3 Bias: why not run a very large poll?

-

For realistic values of \(p\), say from 0.35 to 0.65, if we run a very large poll with 100,000 people, theory tells us that we would predict the election perfectly since the largest possible margin of error is around 0.3%:

-
+7.3 Bias: Why not run a very large poll?

+

For realistic values of \(p\), let’s say ranging from 0.35 to 0.65, if we conduct a very large poll with 100,000 people, theory tells us that we would predict the election perfectly, as the largest possible margin of error is around 0.3%:

+
#> Warning: `qplot()` was deprecated in ggplot2 3.4.0.
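As a rough check of the 0.3% figure (an added illustration, not part of the original text), the margin of error is largest when \(p=0.5\), so with a sample size of 100,000:
N <- 10^5
1.96*sqrt(0.5*0.5/N)  ## largest possible margin of error
#> [1] 0.0031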
@@ -544,12 +550,12 @@

-

One reason is that running such a poll is very expensive. Another possibly more important reason is that theory has its limitations. Polling is much more complicated than picking beads from an urn. Some people might lie to pollsters and others might not have phones. But perhaps the most important way an actual poll differs from an urn model is that we actually don’t know for sure who is in our population and who is not. How do we know who is going to vote? Are we reaching all possible voters? Hence, even if our margin of error is very small, it might not be exactly right that our expected value is \(p\). We call this bias. Historically, we observe that polls are indeed biased, although not by that much. The typical bias appears to be about 1-2%. This makes election forecasting a bit more interesting and we will talk about how to model this in a later chapter.

+

One reason is that conducting such a poll is very expensive. Another, and possibly more important reason, is that theory has its limitations. Polling is much more complicated than simply picking beads from an urn. Some people might lie to pollsters, and others might not have phones. However, perhaps the most important way an actual poll differs from an urn model is that we don’t actually know for sure who is in our population and who is not. How do we know who is going to vote? Are we reaching all possible voters? Hence, even if our margin of error is very small, it might not be exactly right that our expected value is \(p\). We call this bias. Historically, we observe that polls are indeed biased, although not by a substantial amount. The typical bias appears to be about 1-2%. This makes election forecasting a bit more interesting, and we will explore how to model this in a later section.

7.4 Exercises

-

1. Write an urn model function that takes the proportion of Democrats \(p\) and the sample size \(N\) as arguments and returns the sample average if Democrats are 1s and Republicans are 0s. Call the function take_sample.

-

2. Now assume p <- 0.45 and that your sample size is \(N=100\). Take a sample 10,000 times and save the vector of mean(X) - p into an object called errors. Hint: use the function you wrote for exercise 1 to write this in one line of code.

-

3. The vector errors contains, for each simulated sample, the difference between the actual \(p\) and our estimate \(\bar{X}\). We refer to this difference as the error. Compute the average and make a histogram of the errors generated in the Monte Carlo simulation and select which of the following best describes their distributions:

+

1. Write an urn model function that takes the proportion of Democrats \(p\) and the sample size \(N\) as arguments, and returns the sample average if Democrats are 1s and Republicans are 0s. Call the function take_sample.

+

2. Now assume p <- 0.45 and that your sample size is \(N=100\). Take a sample 10,000 times and save the vector of mean(X) - p into an object called errors. Hint: Use the function you wrote for exercise 1 to write this in one line of code.

+

3. The vector errors contains, for each simulated sample, the difference between the actual \(p\) and our estimate \(\bar{X}\). We refer to this difference as the error. Compute the average and make a histogram of the errors generated in the Monte Carlo simulation, and select which of the following best describes their distributions:

mean(errors)
 hist(errors)
@@ -560,18 +566,18 @@

  • The errors are symmetrically distributed around 0.
  • The errors range from -1 to 1.
-

    4. The error \(\bar{X}-p\) is a random variable. In practice, the error is not observed because we do not know \(p\). Here we observe it because we constructed the simulation. What is the average size of the error if we define the size by taking the absolute value \(\mid \bar{X} - p \mid\) ?

    -

    5. The standard error is related to the typical size of the error we make when predicting. We say size because we just saw that the errors are centered around 0, so thus the average error value is 0. For mathematical reasons related to the Central Limit Theorem, we actually use the standard deviation of errors rather than the average of the absolute values to quantify the typical size. What is this standard deviation of the errors?

    +

    4. The error \(\bar{X}-p\) is a random variable. In practice, the error is not observed because we do not know \(p\). Here, we observe it since we constructed the simulation. What is the average size of the error if we define the size by taking the absolute value \(\mid \bar{X} - p \mid\)?

    +

    5. The standard error is related to the typical size of the error we make when predicting. For mathematical reasons related to the Central Limit Theorem, we actually use the standard deviation of errors, rather than the average of the absolute values, to quantify the typical size. What is this standard deviation of the errors?

    6. The theory we just learned tells us what this standard deviation is going to be because it is the standard error of \(\bar{X}\). What does theory tell us is the standard error of \(\bar{X}\) for a sample size of 100?

7. In practice, we don’t know \(p\), so we construct an estimate of the theoretical prediction by plugging in \(\bar{X}\) for \(p\). Compute this estimate. Set the seed at 1 with set.seed(1).

    -

    8. Note how close the standard error estimates obtained from the Monte Carlo simulation (exercise 5), the theoretical prediction (exercise 6), and the estimate of the theoretical prediction (exercise 7) are. The theory is working and it gives us a practical approach to knowing the typical error we will make if we predict \(p\) with \(\bar{X}\). Another advantage that the theoretical result provides is that it gives an idea of how large a sample size is required to obtain the precision we need. Earlier we learned that the largest standard errors occur for \(p=0.5\). Create a plot of the largest standard error for \(N\) ranging from 100 to 5,000. Based on this plot, how large does the sample size have to be to have a standard error of about 1%?

    +

    8. Note how close the standard error estimates obtained from the Monte Carlo simulation (exercise 5), the theoretical prediction (exercise 6), and the estimate of the theoretical prediction (exercise 7) are. The theory is working and it gives us a practical approach to knowing the typical error we will make if we predict \(p\) with \(\bar{X}\). Another advantage that the theoretical result provides is that it gives an idea of how large a sample size is required to obtain the precision we need. Earlier, we learned that the largest standard errors occur for \(p=0.5\). Create a plot of the largest standard error for \(N\) ranging from 100 to 5,000. Based on this plot, how large does the sample size have to be to have a standard error of about 1%?

    1. 100
    2. 500
    3. 2,500
    4. 4,000
    -

    9. For sample size \(N=100\), the central limit theorem tells us that the distribution of \(\bar{X}\) is:

    +

    9. For sample size \(N=100\), the Central Limit Theorem tells us that the distribution of \(\bar{X}\) is:

    1. practically equal to \(p\).
    2. approximately normal with expected value \(p\) and standard error \(\sqrt{p(1-p)/N}\).
    3. @@ -586,7 +592,7 @@

      not a random variable.

    11. To corroborate your answer to exercise 9, make a qq-plot of the errors you generated in exercise 2 to see if they follow a normal distribution.

    -

    12. If \(p=0.45\) and \(N=100\) as in exercise 2, use the CLT to estimate the probability that \(\bar{X}>0.5\). You can assume you know \(p=0.45\) for this calculation.

    +

    12. If \(p=0.45\) and \(N=100\) as in exercise 2, use the CLT to estimate the probability that \(\bar{X}>0.5\). Assume you know \(p=0.45\) for this calculation.

    13. Assume you are in a practical situation and you don’t know \(p\). Take a sample of size \(N=100\) and obtain a sample average of \(\bar{X} = 0.51\). What is the CLT approximation for the probability that your error is equal to or larger than 0.01?

diff --git a/docs/inference/clt_files/figure-html/normal-approximation-for-polls-1.png b/docs/inference/clt_files/figure-html/normal-approximation-for-polls-1.png
index ddd05a5..5a48eeb 100644
Binary files a/docs/inference/clt_files/figure-html/normal-approximation-for-polls-1.png and b/docs/inference/clt_files/figure-html/normal-approximation-for-polls-1.png differ
diff --git a/docs/inference/confidence-intervals.html b/docs/inference/confidence-intervals.html
index a31227b..3fe8f79 100644
--- a/docs/inference/confidence-intervals.html
+++ b/docs/inference/confidence-intervals.html


    -

    Confidence intervals are a very useful concept widely employed by data analysts. A version of these that are commonly seen come from the ggplot geometry geom_smooth. Here is an example using a temperature dataset available in R:

    +

Confidence intervals are a very useful concept widely employed by data analysts. A version of these that is commonly seen comes from the ggplot geometry geom_smooth. Below is an example using a temperature dataset available in R:

    @@ -429,38 +435,38 @@

    -

    In the Machine Learning part we will learn how the curve is formed, but for now consider the shaded area around the curve. This is created using the concept of confidence intervals.

    -

    In our earlier competition, you were asked to give an interval. If the interval you submitted includes the \(p\), you get half the money you spent on your “poll” back and pass to the next stage of the competition. One way to pass to the second round is to report a very large interval. For example, the interval \([0,1]\) is guaranteed to include \(p\). However, with an interval this big, we have no chance of winning the competition. Similarly, if you are an election forecaster and predict the spread will be between -100% and 100%, you will be ridiculed for stating the obvious. Even a smaller interval, such as saying the spread will be between -10 and 10%, will not be considered serious.

    +

    In the Machine Learning section, we will learn how the curve is formed, but for now consider the shaded area around the curve. This is created using the concept of confidence intervals.

    +

In our earlier competition, you were asked to give an interval. If the interval you submitted includes \(p\), you receive half the money you spent on your “poll” back and proceed to the next stage of the competition. One way to pass to the second round is to report a very large interval. For example, the interval \([0,1]\) is guaranteed to include \(p\). However, with an interval this big, we have no chance of winning the competition. Similarly, if you are an election forecaster and predict the spread will be between -100% and 100%, you will be ridiculed for stating the obvious. Even a smaller interval, such as saying the spread will be between -10 and 10%, will not be considered serious.

    On the other hand, the smaller the interval we report, the smaller our chances are of winning the prize. Likewise, a bold pollster that reports very small intervals and misses the mark most of the time will not be considered a good pollster. We want to be somewhere in between.

    -

    We can use the statistical theory we have learned to compute the probability of any given interval including \(p\). If we are asked to create an interval with, say, a 95% chance of including \(p\), we can do that as well. These are called 95% confidence intervals.

    -

    When a pollster reports an estimate and a margin of error, they are, in a way, reporting a 95% confidence interval. Let’s show how this works mathematically.

    +

    We can use the statistical theory we have learned to compute the probability of any given interval including \(p\). If we are asked to create an interval with, say, a 95% chance of including \(p\), we can do that as well; these are called 95% confidence intervals.

    +

    When a pollster reports an estimate and a margin of error, they are, in a way, reporting a 95% confidence interval. Let’s now see how this works mathematically.

    We want to know the probability that the interval \([\bar{X} - 2\hat{\mbox{SE}}(\bar{X}), \bar{X} + 2\hat{\mbox{SE}}(\bar{X})]\) contains the true proportion \(p\). First, consider that the start and end of these intervals are random variables: every time we take a sample, they change. To illustrate this, run the Monte Carlo simulation above twice. We use the same parameters as above:

    p <- 0.45
     N <- 1000

    And notice that the interval here:

    -
    -
    x <- sample(c(0, 1), size = N, replace = TRUE, prob = c(1-p, p))
    +
    +
    x <- sample(c(0, 1), size = N, replace = TRUE, prob = c(1 - p, p))
     x_hat <- mean(x)
    -se_hat <- sqrt(x_hat * (1 - x_hat) / N)
    -c(x_hat - 1.96 * se_hat, x_hat + 1.96 * se_hat)
    -#> [1] 0.422 0.484
+se_hat <- sqrt(x_hat*(1 - x_hat)/N)
+c(x_hat - 1.96*se_hat, x_hat + 1.96*se_hat)
+#> [1] 0.427 0.489

    is different from this one:

    -
    -
    x <- sample(c(0,1), size=N, replace=TRUE, prob=c(1-p, p))
    +
    +
    x <- sample(c(0,1), size = N, replace = TRUE, prob = c(1 - p, p))
     x_hat <- mean(x)
    -se_hat <- sqrt(x_hat * (1 - x_hat) / N)
    -c(x_hat - 1.96 * se_hat, x_hat + 1.96 * se_hat)
    -#> [1] 0.451 0.513
+se_hat <- sqrt(x_hat*(1 - x_hat)/N)
+c(x_hat - 1.96*se_hat, x_hat + 1.96*se_hat)
+#> [1] 0.467 0.529
    -

    Keep sampling and creating intervals and you will see the random variation.

    -

    To determine the probability that the interval includes \(p\), we need to compute this:

    +

    Keep sampling and creating intervals, and you will see the random variation.

    +

    To determine the probability that the interval includes \(p\), we need to compute the following:

    \[ \mbox{Pr}\left(\bar{X} - 1.96\hat{\mbox{SE}}(\bar{X}) \leq p \leq \bar{X} + 1.96\hat{\mbox{SE}}(\bar{X})\right) \]

    -

    By subtracting and dividing the same quantities in all parts of the equation, we get that the above is equivalent to:

    +

    By subtracting and dividing the same quantities in all parts of the equation, we find that the above is equivalent to:

    \[ \mbox{Pr}\left(-1.96 \leq \frac{\bar{X}- p}{\hat{\mbox{SE}}(\bar{X})} \leq 1.96\right) \]

    @@ -484,14 +490,13 @@

z
#> [1] 2.58
-

    will achieve this because by definition pnorm(qnorm(0.995)) is 0.995 and by symmetry pnorm(1-qnorm(0.995)) is 1 - 0.995. As a consequence, we have that:

    +

will achieve this because by definition pnorm(qnorm(0.995)) is 0.995, and by symmetry pnorm(-qnorm(0.995)) is 1 - 0.995. As a consequence, we have that:

    pnorm(z) - pnorm(-z)
     #> [1] 0.99

    is 0.995 - 0.005 = 0.99.
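As an added aside (not in the original text), the corresponding 99% confidence interval for the last sample drawn above could be computed from the x_hat and se_hat already defined:
z <- qnorm(0.995)  ## 2.58, as computed above
c(x_hat - z*se_hat, x_hat + z*se_hat)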

    -

    We can use this approach for any probability, not just 0.95 and 0.99. In statistics textbooks, these are usually written for any probability as \(1-\alpha\). We can then obtain the \(z\) for the equation above noting using z = qnorm(1 - alpha / 2) because \(1 - \alpha/2 - \alpha/2 = 1 - \alpha\).

    -

    So, for example, for \(\alpha=0.05\), \(1 - \alpha/2 = 0.975\) and we get the 1.96 we have been using:

    +

We can use this approach for any probability, not just 0.95 and 0.99. In statistics textbooks, confidence interval formulas are given for arbitrary probabilities written as \(1-\alpha\). We can obtain the \(z\) for the equation above using z = qnorm(1 - alpha / 2) because \(1 - \alpha/2 - \alpha/2 = 1 - \alpha\). So, for example, for \(\alpha=0.05\), \(1 - \alpha/2 = 0.975\) and we get the \(z=1.96\) we used above:

    qnorm(0.975)
     #> [1] 1.96
    @@ -499,20 +504,20 @@

    8.1 A Monte Carlo simulation

    We can run a Monte Carlo simulation to confirm that, in fact, a 95% confidence interval includes \(p\) 95% of the time.

    -
    +
    N <- 1000
     B <- 10000
     inside <- replicate(B, {
    -  x <- sample(c(0,1), size = N, replace = TRUE, prob = c(1-p, p))
    +  x <- sample(c(0,1), size = N, replace = TRUE, prob = c(1 - p, p))
       x_hat <- mean(x)
    -  se_hat <- sqrt(x_hat * (1 - x_hat) / N)
    -  between(p, x_hat - 1.96 * se_hat, x_hat + 1.96 * se_hat)
    +  se_hat <- sqrt(x_hat*(1 - x_hat)/N)
    +  between(p, x_hat - 1.96*se_hat, x_hat + 1.96*se_hat)
     })
     mean(inside)
     #> [1] 0.948

    The following plot shows the first 100 confidence intervals. In this case, we created the simulation so the black line denotes the parameter we are trying to estimate:

    -
    +

    @@ -520,15 +525,23 @@

    -

    ::: {.callout-note title = “The correct language”}

    -

    When using the theory we described above, it is important to remember that it is the intervals that are random, not \(p\). In the plot above, we can see the random intervals moving around and \(p\), represented with the vertical line, staying in the same place. The proportion of blue in the urn \(p\) is not. So the 95% relates to the probability that this random interval falls on top of \(p\). Saying the \(p\) has a 95% chance of being between this and that is technically an incorrect statement because \(p\) is not random. :::

    +
    +
    +
    + +
    +
    +

When applying the theory we described above, it’s important to remember that it’s the intervals that are random, not \(p\). In the plot above, we can see the random intervals moving around, while the proportion of blue beads in the urn \(p\), represented with the vertical line, remains in the same place. So the 95% relates to the probability that the random interval falls on top of \(p\). Stating that \(p\) has a 95% chance of being between this and that is technically incorrect because \(p\) is not random.

    +
    +
    +

    8.2 Exercises

    For these exercises, we will use actual polls from the 2016 election. You can load the data from the dslabs package.

    library(dslabs)
    -

    Specifically, we will use all the national polls that ended within one week before the election.

    +

    Specifically, we will use all the national polls that ended within one week prior to the election.

    library(tidyverse)
     polls <- polls_us_election_2016 |> 
    @@ -540,19 +553,19 @@ 

    x_hat <- polls$rawpoll_clinton[1]/100

    Assume there are only two candidates and construct a 95% confidence interval for the election night proportion \(p\).

    -

    2. Now use dplyr to add a confidence interval as two columns, call them lower and upper, to the object poll. Then use select to show the pollster, enddate, x_hat,lower, upper variables. Hint: define temporary columns x_hat and se_hat.

    +

2. Now use dplyr to add a confidence interval as two columns, call them lower and upper, to the object polls. Then, use select to show the pollster, enddate, x_hat, lower, upper variables. Hint: Define temporary columns x_hat and se_hat.

    3. The final tally for the popular vote was Clinton 48.2% and Trump 46.1%. Add a column, call it hit, to the previous table stating if the confidence interval included the true proportion \(p=0.482\) or not.

    4. For the table you just created, what proportion of confidence intervals included \(p\)?

    5. If these confidence intervals are constructed correctly, and the theory holds up, what proportion should include \(p\)?

    -

    6. A much smaller proportion of the polls than expected produce confidence intervals containing \(p\). If you look closely at the table, you will see that most polls that fail to include \(p\) are underestimating. The reason for this is undecided voters, individuals polled that do not yet know who they will vote for or do not want to say. Because, historically, undecideds divide evenly between the two main candidates on election day, it is more informative to estimate the spread or the difference between the proportion of two candidates \(\mu\), which in this election was \(0. 482 - 0.461 = 0.021\). Assume that there are only two parties and that \(\mu = 2p - 1\), redefine polls as below and re-do exercise 1, but for the difference.

    -
    +

    6. A much smaller proportion of the polls than expected produce confidence intervals containing \(p\). If you look closely at the table, you will see that most polls that fail to include \(p\) are underestimating. The reason for this is undecided voters, individuals polled that do not yet know who they will vote for or do not want to say. Because, historically, undecideds divide evenly between the two main candidates on election day, it is more informative to estimate the spread or the difference between the proportion of two candidates, \(\mu\), which in this election was \(0. 482 - 0.461 = 0.021\). Assume that there are only two parties and that \(\mu = 2p - 1\). Redefine polls as below and re-do exercise 1, but for the difference.

    +
    polls <- polls_us_election_2016 |> 
       filter(enddate >= "2016-10-31" & state == "U.S.")  |>
    -  mutate(mu_hat = rawpoll_clinton / 100 - rawpoll_trump / 100)
    + mutate(mu_hat = rawpoll_clinton/100 - rawpoll_trump/100)

    7. Now repeat exercise 3, but for the difference.

    8. Now repeat exercise 4, but for the difference.

    -

    9. Although the proportion of confidence intervals goes up substantially, it is still lower than 0.95. In the next chapter, we learn the reason for this. To motivate this, make a plot of the error, the difference between each poll’s estimate and the actual \(mu=0.021\). Stratify by pollster.

    +

9. Although the proportion of confidence intervals increases substantially, it is still lower than 0.95. In the next chapter, we learn the reason for this. To motivate this, make a plot of the error, the difference between each poll’s estimate and the actual \(\mu=0.021\). Stratify by pollster.

    10. Redo the plot that you made for exercise 9, but only for pollsters that took five or more polls.

diff --git a/docs/inference/confidence-intervals_files/figure-html/confidence-interval-coverage-1.png b/docs/inference/confidence-intervals_files/figure-html/confidence-interval-coverage-1.png
index 138e464..563870c 100644
Binary files a/docs/inference/confidence-intervals_files/figure-html/confidence-interval-coverage-1.png and b/docs/inference/confidence-intervals_files/figure-html/confidence-interval-coverage-1.png differ
diff --git a/docs/inference/hierarchical-models.html b/docs/inference/hierarchical-models.html
index 0b0db34..5ee5f1a 100644
--- a/docs/inference/hierarchical-models.html
+++ b/docs/inference/hierarchical-models.html
@@ -5,7 +5,7 @@
-Advanced Data Science - 12  Hierarchichal Models
+Advanced Data Science - 13  Hierarchichal Models