diff --git a/latex-math/basic-math.tex b/latex-math/basic-math.tex
index cf76ff4ad..4acb50e54 100644
--- a/latex-math/basic-math.tex
+++ b/latex-math/basic-math.tex
@@ -1,3 +1,4 @@
+% dependencies: amsmath, amssymb, dsfont, bm
 % math spaces
 \ifdefined\N
 \renewcommand{\N}{\mathds{N}} % N, naturals
@@ -6,7 +7,7 @@
 \newcommand{\Q}{\mathds{Q}} % Q, rationals
 \newcommand{\R}{\mathds{R}} % R, reals
 \ifdefined\C
- \renewcommand{\C}{\mathds{C}} % C, complex
+\renewcommand{\C}{\mathds{C}} % C, complex
 \else \newcommand{\C}{\mathds{C}} \fi
 \newcommand{\continuous}{\mathcal{C}} % C, space of continuous functions
 \newcommand{\M}{\mathcal{M}} % machine numbers
@@ -49,9 +50,9 @@
 \newcommand{\prodjp}{\prod\limits_{j=1}^p} % product from j=1 to p

 % linear algebra
-\newcommand{\one}{\boldsymbol{1}} % 1, unitvector
+\newcommand{\one}{\bm{1}} % 1, unitvector
 \newcommand{\zero}{\mathbf{0}} % 0-vector
-\newcommand{\id}{\boldsymbol{I}} % I, identity
+\newcommand{\id}{\bm{I}} % I, identity
 \newcommand{\diag}{\operatorname{diag}} % diag, diagonal
 \newcommand{\trace}{\operatorname{tr}} % tr, trace
 \newcommand{\spn}{\operatorname{span}} % span
diff --git a/latex-math/basic-ml.tex b/latex-math/basic-ml.tex
index eff539e6f..c250ef6b7 100644
--- a/latex-math/basic-ml.tex
+++ b/latex-math/basic-ml.tex
@@ -1,7 +1,7 @@
 % machine learning
 \newcommand{\Xspace}{\mathcal{X}} % X, input space
 \newcommand{\Yspace}{\mathcal{Y}} % Y, output space
-\newcommand{\Zspace}{\mathcal{Z}} % Space of sampled datapoints ! Also defined identically in ml-online.tex !
+\newcommand{\Zspace}{\mathcal{Z}} % Z, space of sampled datapoints
 \newcommand{\nset}{\{1, \ldots, n\}} % set from 1 to n
 \newcommand{\pset}{\{1, \ldots, p\}} % set from 1 to p
 \newcommand{\gset}{\{1, \ldots, g\}} % set from 1 to g
@@ -26,6 +26,7 @@
 \newcommand{\xdat}{\left\{ \xv^{(1)}, \ldots, \xv^{(n)}\right\}} % {x1, ..., xn}, input data
 \newcommand{\ydat}{\left\{ \yv^{(1)}, \ldots, \yv^{(n)}\right\}} % {y1, ..., yn}, input data
 \newcommand{\yvec}{\left(y^{(1)}, \hdots, y^{(n)}\right)^\top} % (y1, ..., yn), vector of outcomes
+\let\greekxi\xi % Greek letter xi, saved before \xi is redefined below
 \renewcommand{\xi}[1][i]{\xv^{(#1)}} % x^i, i-th observed value of x
 \newcommand{\yi}[1][i]{y^{(#1)}} % y^i, i-th observed value of y
 \newcommand{\xivec}{\left(x^{(i)}_1, \ldots, x^{(i)}_p\right)^\top} % (x1^i, ..., xp^i), i-th observation vector
@@ -54,10 +55,10 @@
 \newcommand{\fkx}[1][k]{f_{#1}(\xv)} % f_j(x), discriminant component function
 \newcommand{\fh}{\hat{f}} % f hat, estimated prediction function
 \newcommand{\fxh}{\fh(\xv)} % fhat(x)
-\newcommand{\fxt}{f(\xv ~|~ \thetab)} % f(x | theta)
+\newcommand{\fxt}{f(\xv ~|~ \thetav)} % f(x | theta)
 \newcommand{\fxi}{f\left(\xv^{(i)}\right)} % f(x^(i))
 \newcommand{\fxih}{\hat{f}\left(\xv^{(i)}\right)} % f(x^(i))
-\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetab\right)} % f(x^(i) | theta)
+\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetav\right)} % f(x^(i) | theta)
 \newcommand{\fhD}{\fh_{\D}} % fhat_D, estimate of f based on D
 \newcommand{\fhDtrain}{\fh_{\Dtrain}} % fhat_Dtrain, estimate of f based on D
 \newcommand{\fhDnlam}{\fh_{\Dn, \lamv}} %model learned on Dn with hp lambda
@@ -69,9 +70,9 @@
 \newcommand{\hx}{h(\xv)} % h(x), discrete prediction function
 \newcommand{\hh}{\hat{h}} % h hat
 \newcommand{\hxh}{\hat{h}(\xv)} % hhat(x)
-\newcommand{\hxt}{h(\xv | \thetab)} % h(x | theta)
+\newcommand{\hxt}{h(\xv | \thetav)} % h(x | theta)
 \newcommand{\hxi}{h\left(\xi\right)} % h(x^(i))
-\newcommand{\hxit}{h\left(\xi ~|~ \thetab\right)} % h(x^(i) | theta)
+\newcommand{\hxit}{h\left(\xi ~|~ \thetav\right)} % h(x^(i) | theta)
 \newcommand{\hbayes}{h^{\ast}} % Bayes-optimal classification model
 \newcommand{\hxbayes}{h^{\ast}(\xv)} % Bayes-optimal classification model

@@ -82,27 +83,27 @@

 % theta
 \newcommand{\thetah}{\hat{\theta}} % theta hat
-\newcommand{\thetab}{\bm{\theta}} % theta vector
-\newcommand{\thetabh}{\bm{\hat\theta}} % theta vector hat
-\newcommand{\thetat}[1][t]{\thetab^{[#1]}} % theta^[t] in optimization
-\newcommand{\thetatn}[1][t]{\thetab^{[#1 +1]}} % theta^[t+1] in optimization
-\newcommand{\thetahDnlam}{\thetabh_{\Dn, \lamv}} %theta learned on Dn with hp lambda
-\newcommand{\thetahDlam}{\thetabh_{\D, \lamv}} %theta learned on D with hp lambda
-\newcommand{\mint}{\min_{\thetab \in \Theta}} % min problem theta
-\newcommand{\argmint}{\argmin_{\thetab \in \Theta}} % argmin theta
+\newcommand{\thetav}{\bm{\theta}} % theta vector
+\newcommand{\thetavh}{\bm{\hat\theta}} % theta vector hat
+\newcommand{\thetat}[1][t]{\thetav^{[#1]}} % theta^[t] in optimization
+\newcommand{\thetatn}[1][t]{\thetav^{[#1 +1]}} % theta^[t+1] in optimization
+\newcommand{\thetahDnlam}{\thetavh_{\Dn, \lamv}} %theta learned on Dn with hp lambda
+\newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda
+\newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta
+\newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta

 % densities + probabilities
 % pdf of x
 \newcommand{\pdf}{p} % p
 \newcommand{\pdfx}{p(\xv)} % p(x)
-\newcommand{\pixt}{\pi(\xv~|~ \thetab)} % pi(x|theta), pdf of x given theta
-\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
+\newcommand{\pixt}{\pi(\xv~|~ \thetav)} % pi(x|theta), pdf of x given theta
+\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetav\right)} % pi(x^i|theta), pdf of x given theta
 \newcommand{\pixii}[1][i]{\pi\left(\xi[#1]\right)} % pi(x^i), pdf of i-th x

 % pdf of (x, y)
 \newcommand{\pdfxy}{p(\xv,y)} % p(x, y)
-\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetab)} % p(x, y | theta)
-\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetab\right)} % p(x^(i), y^(i) | theta)
+\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetav)} % p(x, y | theta)
+\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetav\right)} % p(x^(i), y^(i) | theta)

 % pdf of x given y
 \newcommand{\pdfxyk}[1][k]{p(\xv | y= #1)} % p(x | y = k)
@@ -112,7 +113,7 @@
 % prior probabilities
 \newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior
 \newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior
-\newcommand{\pit}{\pi(\thetab)} % Prior probability of parameter theta
+\newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta

 % posterior probabilities
 \newcommand{\post}{\P(y = 1 ~|~ \xv)} % P(y = 1 | x), post. prob for y=1
@@ -123,13 +124,13 @@
 \newcommand{\pix}{\pi(\xv)} % pi(x), P(y = 1 | x)
 \newcommand{\piv}{\bm{\pi}} % pi, bold, as vector
 \newcommand{\pikx}[1][k]{\pi_{#1}(\xv)} % pi_k(x), P(y = k | x)
-\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetab)} % pi_k(x | theta), P(y = k | x, theta)
+\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetav)} % pi_k(x | theta), P(y = k | x, theta)
 \newcommand{\pixh}{\hat \pi(\xv)} % pi(x) hat, P(y = 1 | x) hat
 \newcommand{\pikxh}[1][k]{\hat \pi_{#1}(\xv)} % pi_k(x) hat, P(y = k | x) hat
 \newcommand{\pixih}{\hat \pi(\xi)} % pi(x^(i)) with hat
 \newcommand{\pikxih}[1][k]{\hat \pi_{#1}(\xi)} % pi_k(x^(i)) with hat
-\newcommand{\pdfygxt}{p(y ~|~\xv, \thetab)} % p(y | x, theta)
-\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetab\right)} % p(y^i |x^i, theta)
+\newcommand{\pdfygxt}{p(y ~|~\xv, \thetav)} % p(y | x, theta)
+\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetav\right)} % p(y^i |x^i, theta)
 \newcommand{\lpdfygxt}{\log \pdfygxt } % log p(y | x, theta)
 \newcommand{\lpdfyigxit}{\log \pdfyigxit} % log p(y^i |x^i, theta)

@@ -139,8 +140,10 @@

 % residual and margin
 \newcommand{\eps}{\epsilon} % residual, stochastic
+\newcommand{\epsv}{\bm{\epsilon}} % residual, stochastic, as vector
 \newcommand{\epsi}{\epsilon^{(i)}} % epsilon^i, residual, stochastic
 \newcommand{\epsh}{\hat{\epsilon}} % residual, estimated
+\newcommand{\epsvh}{\hat{\epsv}} % residual, estimated, vector
 \newcommand{\yf}{y \fx} % y f(x), margin
 \newcommand{\yfi}{\yi \fxi} % y^i f(x^i), margin
 \newcommand{\Sigmah}{\hat \Sigma} % estimated covariance matrix
@@ -153,7 +156,7 @@
 \newcommand{\Lxyi}{L\left(\yi, \fxi\right)} % loss of observation
 \newcommand{\Lxyt}{L\left(y, \fxt\right)} % loss with f parameterized
 \newcommand{\Lxyit}{L\left(\yi, \fxit\right)} % loss of observation with f parameterized
-\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetab\right)\right)} % loss of observation with f parameterized
+\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetav\right)\right)} % loss of observation with f parameterized
 \newcommand{\Lpixy}{L\left(y, \pix\right)} % loss in classification
 \newcommand{\Lpiv}{L\left(y, \piv\right)} % loss in classification
 \newcommand{\Lpixyi}{L\left(\yi, \pixii\right)} % loss of observation in classification
@@ -171,26 +174,26 @@
 \newcommand{\riskbayes}{\mathcal{R}^\ast}
 \newcommand{\riskf}{\risk(f)} % R(f), risk
 \newcommand{\riskdef}{\E_{y|\xv}\left(\Lxy \right)} % risk def (expected loss)
-\newcommand{\riskt}{\mathcal{R}(\thetab)} % R(theta), risk
+\newcommand{\riskt}{\mathcal{R}(\thetav)} % R(theta), risk
 \newcommand{\riske}{\mathcal{R}_{\text{emp}}} % R_emp, empirical risk w/o factor 1 / n
 \newcommand{\riskeb}{\bar{\mathcal{R}}_{\text{emp}}} % R_emp, empirical risk w/ factor 1 / n
 \newcommand{\riskef}{\riske(f)} % R_emp(f)
-\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetab)} % R_emp(theta)
+\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetav)} % R_emp(theta)
 \newcommand{\riskr}{\mathcal{R}_{\text{reg}}} % R_reg, regularized risk
-\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetab)} % R_reg(theta)
+\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetav)} % R_reg(theta)
 \newcommand{\riskrf}{\riskr(f)} % R_reg(f)
-\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetab)} % hat R_reg(theta)
-\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetab)} % hat R_emp(theta)
+\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetav)} % hat R_reg(theta)
+\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetav)} % hat R_emp(theta)
 \newcommand{\LL}{\mathcal{L}} % L, likelihood
-\newcommand{\LLt}{\mathcal{L}(\thetab)} % L(theta), likelihood
-\newcommand{\LLtx}{\mathcal{L}(\thetab | \xv)} % L(theta|x), likelihood
+\newcommand{\LLt}{\mathcal{L}(\thetav)} % L(theta), likelihood
+\newcommand{\LLtx}{\mathcal{L}(\thetav | \xv)} % L(theta|x), likelihood
 \newcommand{\logl}{\ell} % l, log-likelihood
-\newcommand{\loglt}{\logl(\thetab)} % l(theta), log-likelihood
-\newcommand{\logltx}{\logl(\thetab | \xv)} % l(theta|x), log-likelihood
+\newcommand{\loglt}{\logl(\thetav)} % l(theta), log-likelihood
+\newcommand{\logltx}{\logl(\thetav | \xv)} % l(theta|x), log-likelihood
 \newcommand{\errtrain}{\text{err}_{\text{train}}} % training error
 \newcommand{\errtest}{\text{err}_{\text{test}}} % test error
 \newcommand{\errexp}{\overline{\text{err}_{\text{test}}}} % avg training error

 % lm
-\newcommand{\thx}{\thetab^\top \xv} % linear model
+\newcommand{\thx}{\thetav^\top \xv} % linear model
 \newcommand{\olsest}{(\Xmat^\top \Xmat)^{-1} \Xmat^\top \yv} % OLS estimator in LM
diff --git a/latex-math/ml-ensembles.tex b/latex-math/ml-ensembles.tex
index 5c58d3b50..6d6f3fa99 100644
--- a/latex-math/ml-ensembles.tex
+++ b/latex-math/ml-ensembles.tex
@@ -17,9 +17,9 @@
 \newcommand{\errm}[1][m]{\text{err}^{[#1]}} % weighted in-sample misclassification rate
 \newcommand{\wm}[1][m]{w^{[#1]}} % weight vector of basemodel m
 \newcommand{\wmi}[1][m]{w^{[#1](i)}} % weight of obs i of basemodel m
-\newcommand{\thetam}[1][m]{\thetab^{[#1]}} % parameters of basemodel m
-\newcommand{\thetamh}[1][m]{\hat{\thetab}^{[#1]}} % parameters of basemodel m with hat
-\newcommand{\blxt}[1][m]{b(\xv, \thetab^{[#1]})} % baselearner, default m
+\newcommand{\thetam}[1][m]{\thetav^{[#1]}} % parameters of basemodel m
+\newcommand{\thetamh}[1][m]{\hat{\thetav}^{[#1]}} % parameters of basemodel m with hat
+\newcommand{\blxt}[1][m]{b(\xv, \thetav^{[#1]})} % baselearner, default m
 \newcommand{\ens}{\sum_{m=1}^M \betam \blxt} % ensemble
 \newcommand{\rmm}[1][m]{\tilde{r}^{[#1]}} % pseudo residuals
 \newcommand{\rmi}[1][m]{\tilde{r}^{[#1](i)}} % pseudo residuals
@@ -33,6 +33,6 @@
 \newcommand{\Lpleft}{\Lp_{\text{left}}}

 % ml - boosting iml lecture
-\newcommand{\ts}{\thetab^{\star}} % theta*
-\newcommand{\bljt}{\bl[j](\xv, \thetab)} % BL j with theta
+\newcommand{\ts}{\thetav^{\star}} % theta*
+\newcommand{\bljt}{\bl[j](\xv, \thetav)} % BL j with theta
 \newcommand{\bljts}{\bl[j](\xv, \ts)} % BL j with theta*
diff --git a/latex-math/ml-eval.tex b/latex-math/ml-eval.tex
index 9780f9c7e..01232669e 100644
--- a/latex-math/ml-eval.tex
+++ b/latex-math/ml-eval.tex
@@ -41,7 +41,7 @@

 % performance measure
 \newcommand{\rhoL}{\rho_L} % perf. measure derived from pointwise loss
-\newcommand{\F}{\boldsymbol{F}} % matrix of prediction scores
+\newcommand{\F}{\bm{F}} % matrix of prediction scores
 \newcommand{\Fi}[1][i]{\F^{(#1)}} % i-th row vector of the predscore mat
 \newcommand{\FJ}[1][J]{\F_{#1}} % predscore mat idxvec J
 \newcommand{\FJf}{\FJ[J,f]} % predscore mat idxvec J and model f
diff --git a/latex-math/ml-infotheory.tex b/latex-math/ml-infotheory.tex
index c06a4423d..4ef9678bb 100644
--- a/latex-math/ml-infotheory.tex
+++ b/latex-math/ml-infotheory.tex
@@ -7,6 +7,6 @@
 \newcommand{\cdentyx}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(y | x) dx dy} % cond diff entropy y|x
 \newcommand{\xentpq}{- \sum_{x \in \Xspace} p(x) \cdot \log q(x)} % cross-entropy of p, q
 \newcommand{\kldpq}{D_{KL}(p \| q)} % KLD between p and q
-\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetab})} % KLD divergence between p and parameterized q
+\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetav})} % KLD divergence between p and parameterized q
 \newcommand{\explogpq}{\E_p \left[\log \frac{p(X)}{q(X)} \right]} % expected LLR of p, q (def KLD)
 \newcommand{\sumlogpq}{\sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)}} % expected LLR of p, q (def KLD)
diff --git a/latex-math/ml-nn.tex b/latex-math/ml-nn.tex
index 6bcf3a145..f6728ba30 100644
--- a/latex-math/ml-nn.tex
+++ b/latex-math/ml-nn.tex
@@ -19,18 +19,18 @@
 \newcommand{\Odropout}{\mathnormal{J}(\theta, \mu|X,y)} % dropout objective function

 % deeplearning - optimization
-\newcommand{\Loss}{L(y, f(\xv, \thetab))}
-\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetab + \varphi \nub))} % momentum risk
-\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetab}))} % Nesterov momentum risk
-\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetab))}
+\newcommand{\Loss}{L(y, f(\xv, \thetav))}
+\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetav + \varphi \nub))} % momentum risk
+\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetav}))} % Nesterov momentum risk
+\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetav))}
 \newcommand{\Hess}{\mathbf{H}}
-\newcommand{\nub}{\boldsymbol{\nu}}
+\newcommand{\nub}{\bm{\nu}}

 % deeplearning - autoencoders
 \newcommand{\uauto}{L(x,g(f(x)))} % undercomplete autoencoder objective function
 \newcommand{\dauto}{L(x,g(f(\tilde{x})))} % denoising autoencoder objective function

 % deeplearning - adversarials
-\newcommand{\deltab}{\boldsymbol{\delta}}
-\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetab))}
-\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetab))}
+\newcommand{\deltab}{\bm{\delta}}
+\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetav))}
+\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetav))}
diff --git a/latex-math/ml-regu.tex b/latex-math/ml-regu.tex
new file mode 100644
index 000000000..07559c739
--- /dev/null
+++ b/latex-math/ml-regu.tex
@@ -0,0 +1,6 @@
+% \thetah is \hat{\theta} (theta hat)
+% \thetav is \bm{\theta} (theta vector)
+\newcommand{\thetas}{\thetav^*} % theta star
+\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE)
+\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO)
+\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (OLS)
diff --git a/latex-math/ml-svm.tex b/latex-math/ml-svm.tex
index 4421505ab..1f4d937f0 100644
--- a/latex-math/ml-svm.tex
+++ b/latex-math/ml-svm.tex
@@ -3,8 +3,8 @@
 \renewcommand{\sl}{\zeta} % slack variable
 \newcommand{\slvec}{\left(\zeta^{(1)}, \zeta^{(n)}\right)} % slack variable vector
 \newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable
-\newcommand{\scptxi}{\scp{\thetab}{\xi}} % scalar prodct of theta and xi
-\newcommand{\svmhplane}{\yi \left( \scp{\thetab}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized)
+\newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar product of theta and xi
+\newcommand{\svmhplane}{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized)
 \newcommand{\alphah}{\hat{\alpha}} % alpha-hat (basis fun coefficients)
 \newcommand{\alphav}{\bm{\alpha}} % vector alpha (bold) (basis fun coefficients)
 \newcommand{\alphavh}{\hat{\bm{\alpha}}} % vector alpha-hat (basis fun coefficients)
@@ -15,4 +15,4 @@
 \newcommand{\phix}{\phi(\xv)} % feature map x
 \newcommand{\phixt}{\phi(\tilde \xv)} % feature map x tilde
 \newcommand{\kxxt}{k(\xv, \tilde \xv)} % kernel fun x, x tilde
-\newcommand{\scptxifm}{\scp{\thetab}{\phi(\xi)}} % scalar prodct of theta and xi
+\newcommand{\scptxifm}{\scp{\thetav}{\phi(\xi)}} % scalar product of theta and xi
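
The hunks above consistently rename \thetab/\thetabh to \thetav/\thetavh, switch \boldsymbol to \bm, and add a few new macros (\epsv, \epsvh, \greekxi). A minimal usage sketch follows; it is not part of the patch and assumes that \xv and the other base macros referenced by the hunk context are defined in basic-math.tex and basic-ml.tex as in the repository, and that amsmath, amssymb, dsfont, and bm are loaded.

% hypothetical test document for the renamed macros (illustration only)
\documentclass{article}
\usepackage{amsmath, amssymb, dsfont, bm}
\input{latex-math/basic-math.tex}
\input{latex-math/basic-ml.tex}
\begin{document}
% \thetav (formerly \thetab) is the parameter vector, \thetavh its estimate
A linear model $\fxt = \thx$ with $\thetav \in \R^p$ is fit by minimizing
$\risket$; the fitted parameters are $\thetavh$, the residual vector is
$\epsv$ with estimate $\epsvh$, and the plain Greek letter remains
available as $\greekxi$.
\end{document}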