/
differential.cljc
1409 lines (1239 loc) · 52.6 KB
/
differential.cljc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
;;
;; Copyright © 2020 Sam Ritchie.
;; This work is based on the Scmutils system of MIT/GNU Scheme:
;; Copyright © 2002 Massachusetts Institute of Technology
;;
;; This is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3 of the License, or (at
;; your option) any later version.
;;
;; This software is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this code; if not, see <http://www.gnu.org/licenses/>.
;;
(ns sicmutils.differential
"This namespace contains an implementation of [[Differential]], a generalized
dual number type that forms the basis for the forward-mode automatic
differentiation implementation in sicmutils.
See [[sicmutils.calculus.derivative]] for a fleshed-out derivative
implementation using [[Differential]]."
(:refer-clojure :rename {compare core-compare}
#?@(:cljs [:exclude [compare]]))
(:require [clojure.string :refer [join]]
[sicmutils.function :as f]
[sicmutils.generic :as g]
[sicmutils.util :as u]
[sicmutils.util.stream :as us]
[sicmutils.util.vector-set :as uv]
[sicmutils.value :as v])
#?(:clj
(:import (clojure.lang AFn IFn))))
;; Differentials, Dual Numbers and Automatic Differentiation
;;
;; This namespace develops an implementation of a type called [[Differential]].
;; A [[Differential]] is a generalization of a type called a ["dual
;; number"](https://en.wikipedia.org/wiki/Dual_number).
;;
;; As we'll discuss, passing these numbers as arguments to some function $f$
;; built out of the [[sicmutils.generic]] operators allows us to build up the
;; _derivative_ of $f$ in parallel to our evaluation of $f$. Complex programs
;; are built out of simple pieces that we know how to evaluate; we can build up
;; derivatives of entire programs in a similar way, building them out of the
;; derivatives of the smaller pieces of those programs.
;;
;; ### Forward-Mode Automatic Differentiation
;;
;; For many scientific computing applications, it's valuable be able to generate
;; a "derivative" of a function; given some wiggle in the inputs, how much
;; wobble does the output produce?
;;
;; we know how to take derivatives of many of the generic functions exposed by
;; SICMUtils, like [[+]], [[*]], [[g/sin]] and friends. It turns out that we can
;; take the derivatives of large, complicated functions by combining the
;; derivatives of these smaller functions using the [chain
;; rule]((https://en.wikipedia.org/wiki/Automatic_differentiation#The_chain_rule,_forward_and_reverse_accumulation))
;; as a clever bookkeeping device.
;;
;; The technique of evaluating a function and its derivative in parallel is
;; called "forward-mode [Automatic
;; Differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)".
;; The [SICMUtils
;; wiki](https://github.com/sicmutils/sicmutils/wiki/Automatic-Differentiation)
;; has more information on the history of this technique, and links to the many
;; other implementations you'll find in different languages. See the [cljdocs
;; Automatic Differentiation
;; page](https://cljdoc.org/d/sicmutils/sicmutils/CURRENT/doc/calculus/automatic-differentiation)
;; for "how do I use this?"-style questions.
;;
;; NOTE: The other flavor of automatic differentiation (AD) is "reverse-mode
;; AD". See [[sicmutils.tape]] for an implementation of this style, coming soon!
;;
;; ### Dual Numbers and AD
;;
;; Our goal is to build up derivatives of complex functions out of the
;; derivatives of small pieces. A [dual
;; number](https://en.wikipedia.org/wiki/Dual_number) is a relatively simple
;; piece of machinery that will help us accomplish this goal.
;; A [dual number](https://en.wikipedia.org/wiki/Dual_number) is a pair of
;; numbers of the form
;;
;; $$a + b \varepsilon$$
;;
;; where $a$ and $b$ are real numbers, and $\varepsilon$ is an abstract thing,
;; with the property that $\varepsilon^2 = 0$.
;; NOTE: This might remind you of the definition of a complex number of the form
;; $a + bi$, where $i$ is also a new thing with the property that $i^2 = -1$.
;; You are very wise! The bigger idea lurking here is the ["generalized complex
;; number"](https://people.rit.edu/harkin/research/articles/generalized_complex_numbers.pdf),
;; and of course mathematicians have pushed this very far. You might explore
;; the ["Split-complex
;; numbers"](https://en.wikipedia.org/wiki/Split-complex_number) too, which
;; arise when you set $i^2 = 1$.
;;
;; Why are dual numbers useful (in SICMUtils)? If you pass $a+b\varepsilon$ in
;; to a function $f$, the result is a dual number $f(a) + Df(a) b \varepsilon$;
;; you get both the function and its derivative evaluated at the same time!
;; To see why, look at what happens when you pass a dual number into the [Taylor
;; series expansion](https://en.wikipedia.org/wiki/Taylor_series) of some
;; arbitrary function $f$. As a reminder, the Taylor series expansion of $f$
;; around some point $a$ is:
;;
;; $$f(x) = f(a)+\frac{Df(a)}{1!}(x-a)+\frac{D^2f(a)}{2!}(x-a)^{2}+\frac{D^3f(a)}{3!}(x-a)^{3}+\cdots$$
;;
;; NOTE: See this nice overview of [Taylor series
;; expansion](https://medium.com/@andrew.chamberlain/an-easy-way-to-remember-the-taylor-series-expansion-a7c3f9101063)
;; by Andrew Chamberlain if you want to understand this idea and why we can
;; approximate (smooth) functions this way.
;;
;; If you evaluate the expansion of $f(x)$ around $a$ with a dual number
;; argument whose first component is $a$ -- take $x=a+b\varepsilon$, for example
;; -- watch how the expansion simplifies:
;;
;; $$f(a+b\varepsilon) = f(a)+\frac{Df(a)}{1!}(b\varepsilon)+\frac{D^2f(a)}{2!}(b\varepsilon)^2+\cdots$$
;;
;; Since $\varepsilon^2=0$ we can ignore all terms beyond the first two:
;;
;; $$f(a+b\varepsilon) = f(a)+ (Df(a)b)\varepsilon$$
;;
;; NOTE: See [[lift-1]] for an implementation of this idea.
;;
;; Interesting! This justifies our claim above: applying a function to some dual
;; number $a+\varepsilon$ returns a new dual number, where
;;
;; - the first component is $f(a)$, the normal function evaluation
;; - the second component is $Df(a)$, the derivative.
;;
;; If we do this twice, the second component of the returned dual number
;; beautifully recreates the [Chain
;; Rule](https://en.wikipedia.org/wiki/Chain_rule):
;;
;; $$
;; \begin{aligned}
;; g(f(a+\varepsilon)) &= g(f(a) + Df(a)\varepsilon) \\
;; &= g(f(a)) + (Dg(f(a)))(Df(a))\varepsilon
;; \end{aligned}
;; $$
;;
;; ### Terminology Change!
;;
;; A "dual number" is a very general idea. Because we're interested in dual
;; numbers as a bookkeeping device for derivatives, we're going to specialize
;; our terminology. From now on, we'll rename $a$ and $b$ to $x$ and $x'$. Given
;; a dual number of the form $x+x'\varepsilon$: we'll refer to:
;;
;; - $x$ as the "primal" part of the dual number
;; - $x'$ as the "tangent" part
;; - $\varepsilon$ as the "tag"
;;
;; NOTE: "primal" means $x$ is tracking the "primal", or "primary", part of the
;; computation. "tangent" is a synonym for "derivative". "tag" is going to make
;; more sense shortly, when we start talking about mixing together multiple
;; $\varepsilon_1$, $\varepsilon_2$ from different computations.
;;
;; ### Binary Functions
;;
;; What about functions of more than one variable? We can use the same approach
;; by leaning on the [multivariable Taylor series
;; expansion](https://en.wikipedia.org/wiki/Taylor_series#Taylor_series_in_several_variables).
;; Take $f(x, y)$ as a binary example. If we pass dual numbers in to the taylor
;; series expansion of $f$, the $\varepsilon$ multiplication rule will erase all
;; higher-order terms, leaving us with:
;;
;; $$f(x+x'\varepsilon, y+y'\varepsilon) = f(x,y) + \partial_1 f(x,y)x' + \partial_2 f(x,y) y'$$
;;
;; NOTE: See [[lift-2]] for an implementation of this idea.
;;
;; This expansion generalizes for n-ary functions; every new argument $x_n +
;; x'_n\varepsilon$ contributes $\partial_n f(...)x'_n$ to the result.
;;
;; We can check this with the simple cases of addition, subtraction and
;; multiplication.
;;
;; The real parts of a dual number add commutatively, so we can rearrange the
;; components of a sum to get a new dual number:
;;
;; $$(x+x'\varepsilon)+(y+y'\varepsilon) == (x+y)+(x'+y')\varepsilon$$
;;
;; This matches the [sum
;; rule](https://en.wikipedia.org/wiki/Differentiation_rules#Differentiation_is_linear)
;; of differentiation, since the partials of $x + y$ with respect to either $x$
;; or $y$ both equal 1.
;;
;; Subtraction is almost identical and agrees with the [subtraction
;; rule](https://en.wikipedia.org/wiki/Differentiation_rules#Differentiation_is_linear):
;;
;; $$(x+x'\varepsilon)-(y+y'\varepsilon) == (x-y)+(x'-y')\varepsilon$$
;;
;; Multiplying out the components of two dual numbers again gives us a new dual
;; number, whose tangent component agrees with the [product
;; rule](https://en.wikipedia.org/wiki/Product_rule):
;;
;; $$
;; \begin{aligned}
;; (x+ x'\varepsilon)*(y+y'\epsilon) &= xy+(xy')\varepsilon+(x'y)\varepsilon+(x'y')\epsilon^2 \\
;; &= xy+(xy'+y'x)\varepsilon
;; \end{aligned}
;; $$
;;
;; Stare at these smaller derivations and convince yourself that they agree with
;; the Taylor series expansion method for binary functions.
;;
;; The upshot is that, armed with these techniques, we can implement a
;; higher-order `derivative` function (almost!) as simply as this:
(comment
(defn derivative [f]
(fn [x]
(extract-tangent
(f (make-dual x 1))))))
;; As long as `f` is built out of functions that know how to apply themselves to
;; dual numbers, this will all Just Work.
;;
;;
;; ### Multiple Variables, Nesting
;;
;; All of the examples above are about first-order derivatives. Taking
;; higher-order derivatives is, in theory, straightforward:
;;
(comment
(derivative
(derivative f)))
;; But this guess hits one of many subtle problems with the implementation of
;; forward-mode AD. The double-call to `derivative` will expand out to this:
(comment
(fn [x]
(letfn [(inner-d [x]
(extract-tangent
(f (make-dual x 1))))]
(extract-tangent
(inner-d
(make-dual x 1))))))
;; the `x` received by `inner-d` will ALREADY be a dual number $x+\varepsilon$!
;; This will cause two immediate problems:
;;
;; - `(make-dual x 1)` will return $(x+\varepsilon)+\varepsilon =
;; x+2\varepsilon$, which is not what we we want
;; - The `extract-tangent` call inside `inner-d` will return the `Df(x)`
;; component of the dual number... which, remember, is no longer a dual
;; number! So the SECOND call to `extract-tangent` have nothing to extract,
;; and can only sensibly return 0.
;;
;; The problem here is called "perturbation confusion", and is covered
;; beautifully in
;; ["Confusion of Tagged Perturbations in Forward Automatic Differentiation of
;; Higher-Order Functions"](https://arxiv.org/abs/1211.4892), by Manzyuk et
;; pal. (2019).
;;
;; The solution is to introduce a new $\varepsilon$ for every level, and allow
;; different $\varepsilon$ instances to multiply without annihalating. Each
;; $\varepsilon$ is called a "tag". [[Differential]] (implemented below) is a
;; generalized dual number that can track many tags at once, allowing nested
;; derivatives like the one described above to work.
;;
;; This implies that `extract-tangent` needs to take a tag, to determine _which_
;; tangent to extract:
(comment
(defn derivative [f]
(let [tag (fresh-tag)]
(fn [x]
(-> (f (make-dual x 1 tag))
(extract-tangent tag))))))
;; This is close to the final form you'll find
;; in [[sicmutils.calculus.derivative/derivative]].
;;
;; ### What Return Values are Allowed?
;;
;; Before we discuss the implementation of dual
;; numbers (called [[Differential]]), [[lift-1]], [[lift-2]] and the rest of the
;; machinery that makes this all possible; what sorts of objects is `f` allowed
;; to return?
;;
;; The dual number approach is beautiful because we can bring to bear all sorts
;; of operations in Clojure that never even _see_ dual numbers. For example,
;; `square-and-cube` called with a dual number returns a PAIR of dual numbers:
(comment
(defn square-and-cube [x]
(let [x2 (g/square x)
x3 (g/cube x)]
[x2 x3])))
;; Vectors don't care what they hold! We want the derivative of
;; `square-and-cube` to also return a vector, whose entries represent the
;; derivative of _that entry_ with respect to the function's input.
;;
;; But this implies that [[extract-tangent]] from the example above needs to
;; know how to handle vectors and other collections; in the case of a vector `v`
;; by returning `(mapv extract-tangent v)`
;;
;; The dual number implementation is called [[Differential]]; the way
;; that [[Differential]] instances interact with container types in SICMUtils
;; makes it easy for these captures to occur all over. Whenever we multiply
;; a [[Differential]] by a structure, a function, a vector, any of those things,
;; our implementation of the SICMUtils generics pushes the [[Differential]]
;; inside those objects, rather than forming a [[Differential]] with, for
;; example, a vector in the primal and tangent parts.
;;
;; Functions... interesting. what _about_ higher-order functions?
(comment
(defn offset-fn
"Returns a function that takes a single-argument function `g`, and returns a new
function like `g` that offsets its input by `offset`."
[offset]
(fn [g]
(fn [x]
(g (+ x offset))))))
;; `(derivative offset-fn)` here returns a function! Ideally we'd like the
;; returned function to act exactly like:
(comment
(derivative
(fn [offset] (g (+ x offset)))))
;; for some known `g` and `x`, but with the ability to store `(derivative
;; offset-fn)` and call it later with many different `g`.
;;
;; We might accomplish this by composing `extract-tangent` with the returned
;; function, so that the extraction happens later, when the function's called.
;;
;; NOTE: The real implementation is more subtle! See
;; the [[sicmutils.calculus.derivative]] namespace for the actual implementation
;; of [[IPerturbed]] for functions and multimethods.
;;
;; All of this suggests that we need to make [[extract-tangent]] an open
;; function that other folks can extend for other container-like
;; types ([functors](https://en.wikipedia.org/wiki/Functor), specifically).
;;
;; The [[IPerturbed]] protocol accomplishes this, along with two other functions
;; that we'll use later:
(defprotocol IPerturbed
(perturbed? [this]
"Returns true if the supplied object has some known non-zero tangent to be
extracted via [[extract-tangent]], false otherwise. (Return `false` by
default if you can't detect a perturbation.)")
(replace-tag [this old-tag new-tag]
"If `this` is perturbed, Returns a similar object with the perturbation
modified by replacing any appearance of `old-tag` with `new-tag`. Else,
return `this`.")
(extract-tangent [this tag]
"If `this` is perturbed, return the tangent component paired with the
supplied tag. Else, returns `([[sicmutils.value/zero-like]] this)`."))
;; `replace-tag` exists to handle subtle bugs that can arise in the case of
;; functional return values. See the "Amazing Bug" sections
;; in [[sicmutils.calculus.derivative-test]] for detailed examples on how this
;; might bite you.
;;
;; The default implementations are straightforward, and match the docstrings:
(extend-protocol IPerturbed
#?(:clj Object :cljs default)
(perturbed? [_] false)
(replace-tag [this _ _] this)
(extract-tangent [this _] (v/zero-like this)))
;; ## Differential Implementation
;;
;; We now have a template for how to implement `derivative`. What's left? We
;; need a dual number type that we can build and split back out into primal and
;; tangent components, given some tag. We'll call this type a [[Differential]].
;;
;; A [[Differential]] is a generalized dual number with a single primal
;; component, and potentially many tagged terms. Identical tags cancel to 0 when
;; multiplied, but are allowed to multiply by each other:
;;
;; $$a + b\varepsilon_1 + c\varepsilon_2 + d\varepsilon_1 \varepsilon_2 + \cdots$$
;;
;; Alternatively, you can view a [[Differential]] as a dual number with a
;; specific tag, that's able to hold dual numbers with some _other_ tag in its
;; primal and tangent slots. You can turn a [[Differential]] into a dual number
;; by specifying one of its tags. Here are the primal and tangent components for
;; $\varepsilon_2$:
;;
;; $$(a + b\varepsilon_1) + (c + d\varepsilon_1)\varepsilon_2$$
;;
;; And for $\varepsilon_1$:
;;
;; $$(a + c\varepsilon_2) + (b + d \varepsilon_2) \varepsilon_1$$
;;
;; A differential term is implemented as a pair whose first element is a set of
;; tags and whose second is the coefficient.
(def ^:private tags first)
(def ^:private coefficient peek)
;; The set of tags is implemented as a "vector set",
;; from [[sicmutils.util.vector-set]]. This is a sorted set data structure,
;; backed by a Clojure vector. vector sets provide cheap "max" and "min"
;; operations, and acceptable union, intersection and difference performance.
(defn- make-term
"Returns a [[Differential]] term with the supplied vector-set of `tags` paired
with coefficient `coef`.
`tags` defaults to [[uv/empty-set]]"
([coef] [uv/empty-set coef])
([tags coef] [tags coef]))
;; Since the only use of a tag is to distinguish each unnamed $\varepsilon_n$,
;; we'll assign a new, unique positive integer for each new tag:
(let [next-tag (atom 0)]
(defn fresh-tag
"Returns a new, unique tag for use inside of a [[Differential]] term list."
[]
(swap! next-tag inc)))
(defn- tag-in-term?
"Return true if `t` is in the tag-set of the supplied `term`, false otherwise."
[term t]
(uv/contains? (tags term) t))
;; ## Term List Algebra
;;
;; The discussion above about Taylor series expansions revealed that for single
;; variable functions, we can pass a dual number into any function whose
;; derivative we already know:
;;
;; $$f(a+b\varepsilon) = f(a) + (Df(a)b)\varepsilon$$
;;
;; Because we can split a [[Differential]] into a primal and tangent component
;; with respect to some tag, we can reuse this result. We'll default to
;; splitting [[Differential]] instances by the highest-index tag:
;;
;; $$
;; \begin{aligned}
;; f(a &+ b\varepsilon_1 + c\varepsilon_2 + d\varepsilon_1 \varepsilon_2) \\
;; &= f((a + b\varepsilon_1)+(c + d\varepsilon_1)\varepsilon_2) \\
;; &= f(a + b\varepsilon_1)+Df(a + b\varepsilon_1)(c + d\varepsilon_1)\varepsilon_2 \\
;; \end{aligned}
;; $$
;;
;; Note that $f$ and $Df$ both received a dual number! One more expansion, this
;; time in $\varepsilon_1$, completes the evaluation (and makes abundantly clear
;; why we want the computer doing this, not pencil-and-paper):
;;
;; $$
;; \begin{aligned}
;; f(a &+ b\varepsilon_1)+Df(a+b\varepsilon_1)(c+d\varepsilon_1)\varepsilon_2 \\
;; &= (f(a)+Df(a)b\varepsilon_1)+(Df(a)+D^2f(a)b\varepsilon_1)(c + d\varepsilon_1)\varepsilon_2 \\
;; &= f(a)+(Df(a)b+D^2f(a)bc)\varepsilon_1+Df(a)c\varepsilon_2+Df(a)d\varepsilon_1\varepsilon_2
;; \end{aligned}
;; $$
;;
;; The only operations we need to implement between lists of terms are addition
;; and multiplication.
;;
;; ### Add and Multiply
;;
;; To efficiently add two [[Differential]] instances (represented as lists of
;; terms), we keep all terms in sorted order, sorted first by the length of each
;; tag list (the "order" of the differential term), and secondarily by the
;; values of the tags inside the tag list.
;;
;; NOTE: Clojure vectors already implement this ordering properly, so we can
;; use [[clojure.core/compare]] to determine an ordering on a tag list.
(defn- terms:+
"Returns the sum of the two supplied sequences of differential terms; any terms
in the result with a zero coefficient will be removed.
Each input must be sequence of `[tag-set, coefficient]` pairs, sorted by
`tag-set`."
[xs ys]
(loop [xs xs, ys ys, result []]
(cond (empty? xs) (into result ys)
(empty? ys) (into result xs)
:else (let [[x-tags x-coef :as x] (first xs)
[y-tags y-coef :as y] (first ys)
compare-flag (core-compare x-tags y-tags)]
(cond
;; If the terms have the same tag set, add the coefficients
;; together. Include the term in the result only if the new
;; coefficient is non-zero.
(zero? compare-flag)
(let [sum (g/+ x-coef y-coef)]
(recur (rest xs)
(rest ys)
(if (v/zero? sum)
result
(conj result (make-term x-tags sum)))))
;; Else, pass the smaller term on unchanged and proceed.
(neg? compare-flag)
(recur (rest xs) ys (conj result x))
:else
(recur xs (rest ys) (conj result y)))))))
;; Because we've decided to store terms as a vector, we can multiply two vectors
;; of terms by:
;;
;; - taking the cartesian product of both term lists
;; - discarding all pairs of terms that share any tag (since $\varepsilon^2=0$)
;; - multiplying the coefficients of all remaining pairs and union-ing their tag
;; lists
;; - grouping and adding any new terms with the SAME list of tags, and filtering
;; out zeros
;;
;; This final step is required by a number of different operations later, so we
;; break it out into its own [[collect-terms]] function:
(defn- collect-terms
"Build a term list up of pairs of tags => coefficients by grouping together and
summing coefficients paired with the same term list.
The result will be sorted by term list, and contain no duplicate term lists."
[tags->coefs]
(let [terms (for [[tags tags-coefs] (group-by tags tags->coefs)
:let [c (transduce (map coefficient) g/+ tags-coefs)]
:when (not (v/zero? c))]
[tags c])]
(into [] (sort-by tags terms))))
;; [[terms:*]] implements the first three steps, and calls [[collect-terms]] on
;; the resulting sequence:
(defn- terms:*
"Returns a vector of non-zero [[Differential]] terms that represent the product
of the differential term lists `xs` and `ys`."
[xs ys]
(collect-terms
(for [[x-tags x-coef] xs
[y-tags y-coef] ys
:when (empty? (uv/intersection x-tags y-tags))]
(make-term (uv/union x-tags y-tags)
(g/* x-coef y-coef)))))
;; ## Differential Type Implementation
;;
;; Armed with our term list arithmetic operations, we can finally implement
;; our [[Differential]] type and implement a number of important Clojure and
;; SICMUtils protocols.
;;
;; A [[Differential]] will respond to [[v/kind]] with `::differential`. Because
;; we want [[Differential]] instances to work in any place that real numbers or
;; symbolic argument work, let's make `::differential` derive from `::v/scalar`:
(derive ::differential ::v/scalar)
;; Now the actual type. The `terms` field is a term-list vector that will
;; remain (contractually!) sorted by its list of tags.
(declare d:apply compare equiv from-terms one?)
(deftype Differential [terms]
;; A [[Differential]] as implemented can act as a chain-rule accounting device
;; for all sorts of types, not just numbers. A [[Differential]] is
;; only [[v/numerical?]] if its coefficients are numerical.
v/Numerical
(numerical? [_]
(v/numerical? (coefficient (first terms))))
IPerturbed
(perturbed? [_] true)
;; There are 3 cases to consider when replacing some tag in a term, annotated
;; below:
(replace-tag [_ oldtag newtag]
(letfn [(process [term]
(let [tagv (tags term)]
(if-not (uv/contains? tagv oldtag)
;; if the term doesn't contain the old tag, ignore it.
[term]
(if (uv/contains? tagv newtag)
;; if the term _already contains_ the new tag
;; $\varepsilon_{new}$, then replacing $\varepsilon_1$
;; with a new instance of $\varepsilon_2$ would cause a
;; clash. Since $\varepsilon_2^2=0$, the term should be
;; removed.
[]
;; else, perform the replacement.
[(make-term (-> tagv
(uv/disj oldtag)
(uv/conj newtag))
(coefficient term))]))))]
(from-terms
(mapcat process terms))))
;; To extract the tangent (with respect to `tag`) from a differential, return
;; all terms that contain the tag (with the tag removed!) This can create
;; duplicate terms, so use [[from-terms]] to massage the result into
;; well-formedness again.
(extract-tangent [_ tag]
(from-terms
(mapcat (fn [term]
(let [tagv (tags term)]
(if (uv/contains? tagv tag)
[(make-term (uv/disj tagv tag)
(coefficient term))]
[])))
terms)))
v/Value
(zero? [this]
(every? (comp v/zero? coefficient) terms))
(one? [this] (one? this))
(identity? [this] (one? this))
(zero-like [_] 0)
(one-like [_] 1)
(identity-like [_] 1)
(freeze [_] `[~'Differential ~@terms])
(exact? [_] false)
(kind [_] ::differential)
Object
;; Comparing [[Differential]] objects using `equals` defaults to [[equiv]],
;; which compares instances only using their non-tagged ('finite') components.
;; If you want to compare two instances using their full term lists,
;; See [[eq]].
#?(:clj (equals [a b] (equiv a b)))
(toString [_] (str "D[" (join " " (map #(join " → " %) terms)) "]"))
;; Because a [[Differential]] is an accounting device that augments other
;; operations with the ability to carry around derivatives, it's possible that
;; the coefficient slots could be occupied by function objects. If so, then it
;; becomes possible to "apply" a [[Differential]] to some arguments (apply
;; each coefficient to the arguments).
;;
;; TODO the arity, if anyone cares to ask, might be better implemented as the
;; joint arity of all coefficients; but my guess here is that the tangent
;; terms all have to be tracking derivatives of the primal term, which have to
;; have the same arity by definition.
f/IArity
(arity [_]
(f/arity (coefficient (first terms))))
#?@(:clj
;; This one is slightly subtle. To participate in control flow operations,
;; like comparison with both [[Differential]] and non-[[Differential]]
;; numbers, [[Differential]] instances should compare using ONLY their
;; non-tagged ("finite") terms. This means that comparison will totally
;; ignore any difference in tags.
[Comparable
(compareTo [a b] (compare a b))
IFn
(invoke [this]
(d:apply this []))
(invoke [this a]
(d:apply this [a]))
(invoke [this a b]
(d:apply this [a b]))
(invoke [this a b c]
(d:apply this [a b c]))
(invoke [this a b c d]
(d:apply this [a b c d]))
(invoke [this a b c d e]
(d:apply this [a b c d e]))
(invoke [this a b c d e f]
(d:apply this [a b c d e f]))
(invoke [this a b c d e f g]
(d:apply this [a b c d e f g]))
(invoke [this a b c d e f g h]
(d:apply this [a b c d e f g h]))
(invoke [this a b c d e f g h i]
(d:apply this [a b c d e f g h i]))
(invoke [this a b c d e f g h i j]
(d:apply this [a b c d e f g h i j]))
(invoke [this a b c d e f g h i j k]
(d:apply this [a b c d e f g h i j k]))
(invoke [this a b c d e f g h i j k l]
(d:apply this [a b c d e f g h i j k l]))
(invoke [this a b c d e f g h i j k l m]
(d:apply this [a b c d e f g h i j k l m]))
(invoke [this a b c d e f g h i j k l m n]
(d:apply this [a b c d e f g h i j k l m n]))
(invoke [this a b c d e f g h i j k l m n o]
(d:apply this [a b c d e f g h i j k l m n o]))
(invoke [this a b c d e f g h i j k l m n o p]
(d:apply this [a b c d e f g h i j k l m n o p]))
(invoke [this a b c d e f g h i j k l m n o p q]
(d:apply this [a b c d e f g h i j k l m n o p q]))
(invoke [this a b c d e f g h i j k l m n o p q r]
(d:apply this [a b c d e f g h i j k l m n o p q r]))
(invoke [this a b c d e f g h i j k l m n o p q r s]
(d:apply this [a b c d e f g h i j k l m n o p q r s]))
(invoke [this a b c d e f g h i j k l m n o p q r s t]
(d:apply this [a b c d e f g h i j k l m n o p q r s t]))
(applyTo [this xs] (AFn/applyToHelper this xs))]
:cljs
[IEquiv
(-equiv [a b] (equiv a b))
IComparable
(-compare [a b] (compare a b))
IPrintWithWriter
(-pr-writer [x writer _]
(write-all writer (.toString x)))
IFn
(-invoke [this]
(d:apply this []))
(-invoke [this a]
(d:apply this [a]))
(-invoke [this a b]
(d:apply this [a b]))
(-invoke [this a b c]
(d:apply this [a b c]))
(-invoke [this a b c d]
(d:apply this [a b c d]))
(-invoke [this a b c d e]
(d:apply this [a b c d e]))
(-invoke [this a b c d e f]
(d:apply this [a b c d e f]))
(-invoke [this a b c d e f g]
(d:apply this [a b c d e f g]))
(-invoke [this a b c d e f g h]
(d:apply this [a b c d e f g h]))
(-invoke [this a b c d e f g h i]
(d:apply this [a b c d e f g h i]))
(-invoke [this a b c d e f g h i j]
(d:apply this [a b c d e f g h i j]))
(-invoke [this a b c d e f g h i j k]
(d:apply this [a b c d e f g h i j k]))
(-invoke [this a b c d e f g h i j k l]
(d:apply this [a b c d e f g h i j k l]))
(-invoke [this a b c d e f g h i j k l m]
(d:apply this [a b c d e f g h i j k l m]))
(-invoke [this a b c d e f g h i j k l m n]
(d:apply this [a b c d e f g h i j k l m n]))
(-invoke [this a b c d e f g h i j k l m n o]
(d:apply this [a b c d e f g h i j k l m n o]))
(-invoke [this a b c d e f g h i j k l m n o p]
(d:apply this [a b c d e f g h i j k l m n o p]))
(-invoke [this a b c d e f g h i j k l m n o p q]
(d:apply this [a b c d e f g h i j k l m n o p q]))
(-invoke [this a b c d e f g h i j k l m n o p q r]
(d:apply this [a b c d e f g h i j k l m n o p q r]))
(-invoke [this a b c d e f g h i j k l m n o p q r s]
(d:apply this [a b c d e f g h i j k l m n o p q r s]))
(-invoke [this a b c d e f g h i j k l m n o p q r s t]
(d:apply this [a b c d e f g h i j k l m n o p q r s t]))
(-invoke [this a b c d e f g h i j k l m n o p q r s t rest]
(d:apply this (concat [a b c d e f g h i j k l m n o p q r s t] rest)))]))
#?(:clj
(defmethod print-method Differential
[^Differential s ^java.io.Writer w]
(.write w (.toString s))))
;; ## Accessor Methods
(defn differential?
"Returns true if the supplied object is an instance of `Differential`, false
otherwise."
[dx]
(instance? Differential dx))
(defn- bare-terms
"Returns the `-terms` field of the supplied `Differential` object. Errors if any
other type is supplied."
[dx]
{:pre [(differential? dx)]}
(.-terms ^Differential dx))
;; ## Constructors
;;
;; Because a [[Differential]] is really a wrapper around the idea of a
;; generalized dual number represented as a term-list, we need to be able to get
;; to and from the term list format from other types, not just [[Differential]]
;; instances.
(defn- ->terms
"Returns a vector of terms that represent the supplied [[Differential]]; any
term with a [[v/zero?]] coefficient will be filtered out before return.
If you pass a non-[[Differential]], [[->terms]] will return a singleton term
list (or `[]` if the argument was zero)."
[dx]
(cond (differential? dx)
(filterv (fn [term]
(not (v/zero? (coefficient term))))
(bare-terms dx))
(v/zero? dx) []
:else [(make-term dx)]))
(defn- terms->differential
"Returns a differential instance generated from a vector of terms. This method
will do some mild cleanup, or canonicalization:
- any empty term list will return 0
- a singleton term list with no tags will return its coefficient
NOTE this method assumes that the input is properly sorted, and contains no
zero coefficients."
[terms]
{:pre [(vector? terms)]}
(cond (empty? terms) 0
(and (= (count terms) 1)
(empty? (tags (first terms))))
(coefficient (first terms))
:else (->Differential terms)))
(defn from-terms
"Accepts a sequence of terms (pairs of [tag-list, coefficient]), and returns:
- a well-formed [[Differential]] instance, if the terms resolve to a
differential with non-zero infinitesimal terms
- the original input otherwise
Duplicate (by tag list) terms are allowed; their coefficients will be summed
together and removed if they sum to zero."
[tags->coefs]
(terms->differential
(collect-terms tags->coefs)))
;; ## Differential API
;;
;; This next section lifts slightly-augmented versions of [[terms:+]]
;; and [[terms:*]] up to operate on [[Differential]] instances. These work just
;; as before, but handle wrapping and unwrapping the term list.
(defn d:+
"Returns an object representing the sum of the two objects `dx` and `dy`. This
works by summing the coefficients of all terms with the same list of tags.
Works with non-[[Differential]] instances on either or both sides, and returns
a [[Differential]] only if it contains any non-zero tangent components."
[dx dy]
(terms->differential
(terms:+ (->terms dx)
(->terms dy))))
(defn d:*
"Returns an object representing the product of the two objects `dx` and `dy`.
This works by multiplying out all terms:
$$(dx1 + dx2 + dx3 + ...)(dy1 + dy2 + dy3...)$$
and then collecting any duplicate terms by summing their coefficients.
Works with non-[[Differential]] instances on either or both sides, and returns
a [[Differential]] only if it contains any non-zero tangent components."
[dx dy]
(terms->differential
(terms:* (->terms dx)
(->terms dy))))
(defn- d:apply
"Accepts a [[Differential]] and a sequence of `args`, interprets each
coefficient as a function and returns a new [[Differential]] generated by
applying the coefficient to `args`."
[diff args]
(terms->differential
(into [] (mapcat (fn [term]
(let [result (apply (coefficient term) args)]
(if (v/zero? result)
[]
[(make-term (tags term) result)]))))
(->terms diff))))
;; Finally, the function we've been waiting for! [[bundle]] allows you to
;; augment some non-[[Differential]] thing with a tag and push it through the
;; generic arithmetic system, where it will accumulate the derivative of your
;; original input (tagged with `tag`.)
(defn bundle
"Generate a new [[Differential]] object with the supplied `primal` and `tangent`
components, and the supplied internal `tag` that this [[Differential]] will
carry around to prevent perturbation confusion.
If the `tangent` component is `0`, acts as identity on `primal`. `tangent`
defaults to 1.
`tag` defaults to a side-effecting call to [[fresh-tag]]; you can retrieve
this unknown tag by calling [[max-order-tag]]."
([primal]
(bundle primal 1 (fresh-tag)))
([primal tag]
(bundle primal 1 tag))
([primal tangent tag]
(let [term (make-term (uv/make [tag]) tangent)]
(d:+ primal (->Differential [term])))))
;; ## Differential Parts API
;;
;; These functions give higher-level access to the components of
;; a [[Differential]] you're typically interested in.
(defn max-order-tag
"Given one or more well-formed [[Differential]] objects, returns the
maximum ('highest order') tag found in the highest-order term of any of
the [[Differential]] instances.
If there is NO maximal tag (ie, if you provide [[Differential]] instances with
no non-zero tangent parts, or all non-[[Differential]]s), returns nil."
([dx]
(when (differential? dx)
(let [last-term (peek (->terms dx))
highest-tag (peek (tags last-term))]
highest-tag)))
([dx & dxs]
(letfn [(max-termv [dx]
(if-let [max-order (max-order-tag dx)]
[max-order]
[]))]
(when-let [orders (seq (mapcat max-termv (cons dx dxs)))]
(apply max orders)))))
;; A reminder: the [[primal-part]] of a [[Differential]] is all terms except for
;; terms containing [[max-order-tag]], and [[tangent-part]] is
;; a [[Differential]] built out of the remaining terms, all of which contain
;; that tag.
(defn primal-part
"Returns a [[Differential]] containing only the terms of `dx` that do NOT
contain the supplied `tag`, ie, the primal component of `dx` with respect to
`tag`.
If no tag is supplied, defaults to `([[max-order-tag]] dx)`.
NOTE: every [[Differential]] can be factored into a dual number of the form
primal + (tangent * tag)
For each tag in any of its terms. [[primal-part]] returns this first piece,
potentially simplified into a non-[[Differential]] if the primal part contains
no other tags."
([dx] (primal-part dx (max-order-tag dx)))
([dx tag]
(if (differential? dx)
(let [sans-tag? #(not (tag-in-term? % tag))]
(->> (->terms dx)
(filterv sans-tag?)
(terms->differential)))
dx)))
(defn tangent-part
"Returns a [[Differential]] containing only the terms of `dx` that contain the
supplied `tag`, ie, the tangent component of `dx` with respect to `tag`.
If no tag is supplied, defaults to `([[max-order-tag]] dx)`.
NOTE: Every [[Differential]] can be factored into a dual number of the form
primal + (tangent * tag)
For each tag in any of its terms. [[tangent-part]] returns a [[Differential]]
representing `(tangent * tag)`, or 0 if `dx` contains no terms with the
supplied `tag`.
NOTE: the 2-arity case is similar to `([[extract-tangent]] dx tag)`; the only
difference is that [[extract-tangent]] drops the `dx` tag from all terms in
the returned value. Call [[extract-tangent]] if you want to drop `tag`."
([dx] (tangent-part dx (max-order-tag dx)))
([dx tag]
(if (differential? dx)
(->> (->terms dx)
(filterv #(tag-in-term? % tag))
(terms->differential))
0)))
(defn primal-tangent-pair
"Returns a pair of the primal and tangent components of the supplied `dx`, with
respect to the supplied `tag`. See the docs for [[primal-part]]
and [[tangent-part]] for more details.
[[primal-tangent-pair]] is equivalent to
`[([[primal-part]] dx tag) ([[tangent-part]] dx tag)]`
but slightly more efficient if you need both."
([dx] (primal-tangent-pair dx (max-order-tag dx)))
([dx tag]
(if-not (differential? dx)
[dx 0]
(let [[tangent-terms primal-terms]
(us/separatev #(tag-in-term? % tag)
(->terms dx))]
[(terms->differential primal-terms)