-
Notifications
You must be signed in to change notification settings - Fork 37
/
ergm.R
882 lines (817 loc) · 42.2 KB
/
ergm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
# File R/ergm.R in package ergm, part of the
# Statnet suite of packages for network analysis, https://statnet.org .
#
# This software is distributed under the GPL-3 license. It is free,
# open source, and has the attribution requirements (GPL Section 7) at
# https://statnet.org/attribution .
#
# Copyright 2003-2022 Statnet Commons
################################################################################
###############################################################################
# The <ergm> function fits ergms from a specified formula returning either
# MPLEs or approximate MLE's based on MCMC estimation.
#
#
# --PARAMETERS--
# formula : a formula of the form 'nw ~ model term(s)'
# init : a vector of starting values for estimation or offset values, or optionally
# if these are to be estimated, NULL (the default);
# constraints : a one-sided formula of the constraint terms; options are
# bd degrees nodegrees
# edges degreedist idegreedist
# observed odegreedist
# default="~ ."
# target.stats : a vector of the mean value parameters;
# default=the observed statistic from the 'nw' in formula
# control : a list of control parameters returned from <control.ergm>;
# default=control.ergm()
# verbose : whether ergm should be verbose (T or F); default=FALSE
#
#
# --RETURNED--
# because a stergm object is the return type of several functions, and
# because this is a rather lengthy list, and because the returned items
# of this function borrow from the other stergm.* functions, this list
# provides the returned items for all funtions returning a stergm.
# The symbol preceding each component indicates which function returns it,
# but remember that, <stergm> will additionally return the items from
# one of the other stergm functions as well:
# because an ergm object is the return type of several functions, and
# because this is a rather lengthy list, this list represents the return
# type for all funtions returning an ergm. The symbol preceding each
# component indicates which function returns it:
# <ergm> = $
# <ergm.mainfitloop> = *
# <ergm.mple> = !
# <ergm.stepping> = @
# <ergm.stocapprox> = %
# <ergm.estimate> = ^
# <ergm.robmon> = &
#
# the components include:
#
# $*!@%^&#~+ coef : the vector of estimated model coefficients
# $* @%^& + sample : the row-binded matrix of network statistics from
# each sample; 'sample' will also have 2 attributes:
# mcpar : the following vector taken from control:
# c(burnin+1, endrun, interval)
# where 'endrun' is defined as
# burnin+interval*(samplesize-1)
# class : "mcmc"
# $*!@%^&#~+ iterations : the number of Newton-Raphson iterations required
# before convergence
# $*!@%^&#~+ MCMCtheta : the vector of natural parameters used to produce
# the MCMC sample
# $* @%^& + loglikelihood : the estimated change in log-likelihood in the last
# iteration
# $*!@%^&#~+ gradient : the value of the gradient of the approximated log-
# likelihood function at the maximizing value
# $*!@%^&#~+ covar : the approximated covariance matrix for the MLE
# $*!@%^&#~+ samplesize : the size of the MCMC sample
# $*!@%^&#~+ failure : whether estimation failed (T or F)
# $*!@%^&#~+ mc.se : the standard error estimates
# $* @% &# + newnetwork : the final network sampled; in the ergm returned from
# <ergm.robmom>, this='network'
# $* @% &# + network : the 'nw' inputted to <ergm> via the 'formula'
# $* @% & + theta.original : the theta values at the start of the MCMC sampling
# $* @ mplefit : the MPLE fit as a glm object, and returned by
# <ergm.mple>
# $*!@% &#~+ mle.lik : the approximate log-likelihood for the MLE, if computed
# $* @ etamap : the set of function mapping theta -> eta;
# see <etamap>? for the components of this list
# $ # formula : the 'formula' value inputted to <ergm>
# $ # constraints : the 'constraints' value inputted to <ergm>
# # prop.args : the list of arguments that were passed onto the
# <InitErgmProposal> routines
# $ # prop.weights : the MCMC proposal weights inputted to <ergm> via
# 'control'
# $ # offset : a vector of whether each model parameter was set at
# a fixed value (not estimated)
# $ # drop : list of dropped terms
# * @%^& + sample.obs : the matrix of sample network statistics for observed
# data
# * @ parallel : the number of additional threads used when sampling
# ! #~ glm : the fit established by MPL estimation and returned
# by <ergm.logitreg>, <ergm.pen.glm> or <glm>
# depending on the 'MPLEtype';
# ! #~ glm.null : the null fit established by MPL estimation and
# returned by <ergm.logitreg>, <ergm.pen.glm> or <glm>
# depending on the 'MPLEtype';
# & rm.coef : the robmon coefficients used as 'init' in the final
# estimation
# ! #~ loglikelihoodratio: the log-likelihood corresponding to
# 'coef'
#
#####################################################################################
#' Exponential-Family Random Graph Models
#'
#' \code{\link{ergm}} is used to fit exponential-family random graph
#' models (ERGMs), in which
#' the probability of a given network, \eqn{y}, on a set of nodes is
#' \eqn{h(y) \exp\{\eta(\theta) \cdot
#' g(y)\}/c(\theta)}, where
#' \eqn{h(y)} is the reference measure (usually \eqn{h(y)=1}),
#' \eqn{g(y)} is a vector of network statistics for \eqn{y},
#' \eqn{\eta(\theta)} is a natural parameter vector of the same
#' length (with \eqn{\eta(\theta)=\theta} for most terms), and \eqn{c(\theta)} is the
#' normalizing constant for the distribution.
#' \code{\link{ergm}} can return a maximum pseudo-likelihood
#' estimate, an approximate maximum likelihood estimate based on a Monte
#' Carlo scheme, or an approximate contrastive divergence estimate based
#' on a similar scheme.
#' (For an overview of the package, see \code{\link{ergm-package}}.)
#'
#' @param formula An \R \code{\link{formula}} object, of the form
#' \code{y ~ <model terms>}, where \code{y} is a
#' \code{\link[network]{network}} object or a matrix that can be
#' coerced to a \code{\link[network]{network}} object. For the
#' details on the possible \code{<model terms>}, see
#' \code{\link{ergmTerm}} and Morris, Handcock and Hunter (2008)
#' for binary ERGM terms and Krivitsky (2012) for valued ERGM terms
#' (terms for weighted edges). To create a
#' \code{\link[network]{network}} object in \R, use the
#' \code{network()} function, then add nodal attributes to it using
#' the \code{\%v\%} operator if necessary. Enclosing a model term in
#' \code{offset()} fixes its value to one specified in
#' \code{offset.coef}. (A second argument---a logical or numeric
#' index vector---can be used to select *which* of the parameters
#' within the term are offsets.)
#'
#' @template response
#' @template reference
#' @template constraints
#'
#' @param obs.constraints A one-sided formula specifying one or more
#' constraints or other modification \emph{in addition} to those
#' specified by \code{constraints}, following the same syntax as the
#' `constraints` argument.
#'
#' This allows the domain of the integral in the numerator of the
#' partially obseved network face-value likelihoods of Handcock and
#' Gile (2010) and Karwa et al. (2017) to be specified explicitly.
#'
#' The default is to constrain the integral to only integrate over
#' the missing dyads (if present), after incorporating constraints
#' provided through the [`ergmlhs`] API.
#'
#' It is also possible to specify a proposal function directly by
#' passing a string with the function's name of the `obs.MCMC.prop`
#' argument to the relevant control function. In that case,
#' arguments to the proposal should be specified through the
#' \code{obs.prop.args} argument to the relevant control function.
#'
#' @param offset.coef {A vector of coefficients for the offset terms.}
#' @param target.stats {vector of "observed network statistics,"
#' if these statistics are for some reason different than the
#' actual statistics of the network on the left-hand side of
#' \code{formula}.
#' Equivalently, this vector is the mean-value parameter values for the
#' model. If this is given, the algorithm finds the natural
#' parameter values corresponding to these mean-value parameters.
#' If \code{NULL}, the mean-value parameters used are the observed
#' statistics of the network in the formula.
#' }
#' @param eval.loglik {Logical: For dyad-dependent models, if TRUE, use bridge
#' sampling to evaluate the log-likelihoood associated with the
#' fit. Has no effect for dyad-independent models.
#' Since bridge sampling takes additional time, setting to FALSE may
#' speed performance if likelihood values (and likelihood-based
#' values like AIC and BIC) are not needed. Can be set globally via `option(ergm.eval.loglik=...)`, which is set to `TRUE` when the package is loaded. (See [`options?ergm`][ergm-options].)
#' }
#' @param estimate {If "MPLE," then the maximum pseudolikelihood estimator
#' is returned. If "MLE" (the default), then an approximate maximum likelihood
#' estimator is returned. For certain models, the MPLE and MLE are equivalent,
#' in which case this argument is ignored. (To force MCMC-based approximate
#' likelihood calculation even when the MLE and MPLE are the same, see the
#' \code{force.main} argument of \code{\link{control.ergm}}. If "CD" (\emph{EXPERIMENTAL}),
#' the Monte-Carlo contrastive divergence estimate is returned. )
#' }
#'
#' @templateVar mycontrol control.ergm
#' @template control
#' @template verbose
#'
#' @param \dots Additional
#' arguments, to be passed to lower-level functions.
#'
#' @template basis
#'
#' @return
#' \code{\link{ergm}} returns an object of class \code{\link{ergm}} that is a list
#' consisting of the following elements:
#' \item{coef}{The Monte Carlo maximum likelihood estimate
#' of \eqn{\theta}, the vector of coefficients for the model
#' parameters.}
#' \item{sample}{The \eqn{n\times p} matrix of network statistics,
#' where \eqn{n} is the
#' sample size and \eqn{p} is the number of network statistics specified in the
#' model, generated by the last iteration of the MCMC-based likelihood maximization routine. These statistics are centered with respect to the observed statistics or `target.stats`, unless missing data MLE is used.}
#' \item{sample.obs}{As \code{sample}, but for the constrained sample.}
#' \item{iterations}{The number of Newton-Raphson iterations required
#' before convergence.}
#' \item{MCMCtheta}{The value of \eqn{\theta} used to produce the Markov chain
#' Monte Carlo sample. As long as the Markov chain mixes sufficiently
#' well, \code{sample} is roughly a random sample from the distribution
#' of network statistics specified by the model with the parameter equal
#' to \code{MCMCtheta}. If \code{estimate="MPLE"} then
#' \code{MCMCtheta} equals the MPLE.}
#' \item{loglikelihood}{The approximate change in log-likelihood
#' in the last iteration.
#' The value is only approximate because it is estimated based
#' on the MCMC random sample.}
#' \item{gradient}{The value of the gradient vector of the approximated
#' loglikelihood function, evaluated at the maximizer. This vector
#' should be very close to zero.}
#' \item{covar}{Approximate covariance matrix for the MLE, based on the inverse
#' Hessian of the approximated loglikelihood evaluated at the maximizer.}
#' \item{failure}{Logical: Did the MCMC estimation fail?}
#' \item{network}{Network passed on the left-hand side of `formula`. If `target.stats` are passed, it is replaced by the network returned by [san()].}
#' \item{newnetworks}{A list of the final networks at the end of the MCMC
#' simulation, one for each thread.}
#' \item{newnetwork}{The first (possibly only) element of \code{newnetworks}.}
#' \item{coef.init}{The initial value of \eqn{\theta}.}
#' \item{est.cov}{The covariance matrix of the model statistics in the final MCMC sample.}
#' \item{coef.hist, steplen.hist, stats.hist, stats.obs.hist}{
#' For the MCMLE method, the history of coefficients, Hummel step lengths, and average model statistics for each iteration..
#' }
#' \item{control}{The control list passed to the call.}
#' \item{etamap}{The set of functions mapping the true parameter theta
#' to the canonical parameter eta (irrelevant except in a curved exponential
#' family model)}
#' \item{formula}{The original \code{\link{formula}} entered into the \code{\link{ergm}} function.}
#' \item{target.stats}{The target.stats used during estimation (passed through from the Arguments)}
#' \item{target.esteq}{Used for curved models to preserve the target mean values of the curved terms. It is identical to target.stats for non-curved models.}
#' \item{constrained}{The list of constraints implied by the constraints used by original \code{ergm} call}
#' \item{constraints}{Constraints used during estimation (passed through from the Arguments)}
#' \item{reference}{The reference measure used during estimation (passed through from the Arguments)}
#' \item{estimate}{The estimation method used (passed through from the Arguments).}
#' \item{offset}{vector of logical telling which model parameters are to be set
#' at a fixed value (i.e., not estimated).}
#'
#' \item{drop}{If \code{\link[=control.ergm]{control$drop}=TRUE}, a numeric vector indicating which terms were dropped due to to extreme values of the
#' corresponding statistics on the observed network, and how:
#' \describe{
#' \item{\code{0}}{The term was not dropped.}
#' \item{\code{-1}}{The term was at its minimum and the coefficient was fixed at
#' \code{-Inf}.}
#' \item{\code{+1}}{The term was at its maximum and the coefficient was fixed at
#' \code{+Inf}.}
#' }
#' }
#'
#' \item{estimable}{A logical vector indicating which terms could not be
#' estimated due to a \code{constraints} constraint fixing that term at a
#' constant value.
#' }
#'
#' \item{null.lik}{Log-likelihood of the null model. Valid only for
#' unconstrained models.}
#' \item{mle.lik}{The approximate log-likelihood for the MLE.
#' The value is only approximate because it is estimated based
#' on the MCMC random sample.}
#'
#' @section Notes on model specification:
#' Although each of the statistics in a given model is a summary
#' statistic for the entire network, it is rarely necessary to
#' calculate statistics for an entire network
#' in a proposed Metropolis-Hastings step.
#' Thus, for example, if the triangle term is included in the model,
#' a census of all triangles in the observed network is never
#' taken; instead, only the change in the number of triangles
#' is recorded for each edge toggle.
#'
#' In the implementation of \code{\link{ergm}}, the model is initialized
#' in \R, then all the model information is passed to a C program
#' that generates the sample of network statistics using MCMC.
#' This sample is then returned to \R, which implements a
#' simple Newton-Raphson algorithm to approximate the MLE.
#' An alternative style of maximum likelihood estimation is to use a stochastic
#' approximation algorithm. This can be chosen with the
#' \code{control.ergm(style="Robbins-Monro")} option.
#'
#' The mechanism for proposing new networks for the MCMC sampling
#' scheme, which is a Metropolis-Hastings algorithm, depends on
#' two things: The \code{constraints}, which define the set of possible
#' networks that could be proposed in a particular Markov chain step,
#' and the weights placed on these possible steps by the
#' proposal distribution. The former may be controlled using the
#' \code{constraints} argument described above. The latter may
#' be controlled using the \code{prop.weights} argument to the
#' \code{\link{control.ergm}} function.
#'
#' The package is designed so that the user could conceivably add additional
#' proposal types.
#'
#' @references
#' Admiraal R, Handcock MS (2007).
#' \pkg{networksis}: Simulate bipartite graphs with fixed
#' marginals through sequential importance sampling.
#' Statnet Project, Seattle, WA.
#' Version 1. \url{https://statnet.org}.
#'
#' Bender-deMoll S, Morris M, Moody J (2008).
#' Prototype Packages for Managing and Animating Longitudinal
#' Network Data: \pkg{dynamicnetwork} and \pkg{rSoNIA}.
#' \emph{Journal of Statistical Software}, 24(7).
#' \doi{10.18637/jss.v024.i07}
#'
#'
#' Butts CT (2007).
#' \pkg{sna}: Tools for Social Network Analysis.
#' R package version 2.3-2. \url{https://cran.r-project.org/package=sna}.
#'
#' Butts CT (2008).
#' \pkg{network}: A Package for Managing Relational Data in \R.
#' \emph{Journal of Statistical Software}, 24(2).
#' \doi{10.18637/jss.v024.i02}
#'
#' Butts C (2015).
#' \pkg{network}: The Statnet Project (https://statnet.org). R package version 1.12.0, \url{https://cran.r-project.org/package=network}.
#'
#' Goodreau SM, Handcock MS, Hunter DR, Butts CT, Morris M (2008a).
#' A \pkg{statnet} Tutorial.
#' \emph{Journal of Statistical Software}, 24(8).
#' \doi{10.18637/jss.v024.i08}
#'
#' Goodreau SM, Kitts J, Morris M (2008b).
#' Birds of a Feather, or Friend of a Friend? Using Exponential
#' Random Graph Models to Investigate Adolescent Social Networks.
#' \emph{Demography}, 45, in press.
#'
#' Handcock, M. S. (2003)
#' \emph{Assessing Degeneracy in Statistical Models of Social Networks},
#' Working Paper \#39,
#' Center for Statistics and the Social Sciences,
#' University of Washington.
#' \url{https://csss.uw.edu/research/working-papers/assessing-degeneracy-statistical-models-social-networks}
#'
#' Handcock MS (2003b).
#' \pkg{degreenet}: Models for Skewed Count Distributions Relevant
#' to Networks.
#' Statnet Project, Seattle, WA.
#' Version 1.0, \url{https://statnet.org}.
#'
#' Handcock MS and Gile KJ (2010). Modeling Social Networks from Sampled Data. \emph{Annals of Applied Statistics}, 4(1), 5-25. \doi{10.1214/08-AOAS221}
#'
#' Handcock MS, Hunter DR, Butts CT, Goodreau SM, Morris M (2003a).
#' \pkg{ergm}: A Package to Fit, Simulate and Diagnose
#' Exponential-Family Models for Networks.
#' Statnet Project, Seattle, WA.
#' Version 2, \url{https://statnet.org}.
#'
#' Handcock MS, Hunter DR, Butts CT, Goodreau SM, Morris M (2003b).
#' \pkg{statnet}: Software Tools for the Statistical Modeling of
#' Network Data.
#' Statnet Project, Seattle, WA.
#' Version 2, \url{https://statnet.org}.
#'
#' Hunter, D. R. and Handcock, M. S. (2006)
#' \emph{Inference in curved exponential family models for networks},
#' Journal of Computational and Graphical Statistics.
#'
#' Hunter DR, Handcock MS, Butts CT, Goodreau SM, Morris M (2008b).
#' \pkg{ergm}: A Package to Fit, Simulate and Diagnose
#' Exponential-Family Models for Networks.
#' \emph{Journal of Statistical Software}, 24(3).
#' \doi{10.18637/jss.v024.i03}
#'
#' Karwa V, Krivitsky PN, and Slavkovi\'{c} AB (2017). Sharing Social Network
#' Data: Differentially Private Estimation of Exponential-Family Random
#' Graph Models. \emph{Journal of the Royal Statistical Society, Series
#' C}, 66(3):481--500. \doi{10.1111/rssc.12185}
#'
#' Krivitsky PN (2012). Exponential-Family Random Graph Models for Valued
#' Networks. \emph{Electronic Journal of Statistics}, 2012, 6,
#' 1100-1128. \doi{10.1214/12-EJS696}
#'
#' Morris M, Handcock MS, Hunter DR (2008).
#' Specification of Exponential-Family Random Graph Models:
#' Terms and Computational Aspects.
#' \emph{Journal of Statistical Software}, 24(4).
#' \doi{10.18637/jss.v024.i04}
#'
#' Snijders, T.A.B. (2002),
#' Markov Chain Monte Carlo Estimation of Exponential Random Graph Models.
#' Journal of Social Structure.
#' Available from
#' \url{https://www.cmu.edu/joss/content/articles/volume3/Snijders.pdf}.
#'
#' @seealso [`network`], [`%v%`], [`%n%`], [`ergmTerm`], [`ergmMPLE`],
#' [summary.ergm()]
#'
#' @examples
#' \donttest{
#' #
#' # load the Florentine marriage data matrix
#' #
#' data(flo)
#' #
#' # attach the sociomatrix for the Florentine marriage data
#' # This is not yet a network object.
#' #
#' flo
#' #
#' # Create a network object out of the adjacency matrix
#' #
#' flomarriage <- network(flo,directed=FALSE)
#' flomarriage
#' #
#' # print out the sociomatrix for the Florentine marriage data
#' #
#' flomarriage[,]
#' #
#' # create a vector indicating the wealth of each family (in thousands of lira)
#' # and add it as a covariate to the network object
#' #
#' flomarriage %v% "wealth" <- c(10,36,27,146,55,44,20,8,42,103,48,49,10,48,32,3)
#' flomarriage
#' #
#' # create a plot of the social network
#' #
#' plot(flomarriage)
#' #
#' # now make the vertex size proportional to their wealth
#' #
#' plot(flomarriage, vertex.cex=flomarriage %v% "wealth" / 20, main="Marriage Ties")
#' #
#' # Use 'data(package = "ergm")' to list the data sets in a
#' #
#' data(package="ergm")
#' #
#' # Load a network object of the Florentine data
#' #
#' data(florentine)
#' #
#' # Fit a model where the propensity to form ties between
#' # families depends on the absolute difference in wealth
#' #
#' gest <- ergm(flomarriage ~ edges + absdiff("wealth"))
#' summary(gest)
#' #
#' # add terms for the propensity to form 2-stars and triangles
#' # of families
#' #
#' gest <- ergm(flomarriage ~ kstar(1:2) + absdiff("wealth") + triangle)
#' summary(gest)
#'
#' # import synthetic network that looks like a molecule
#' data(molecule)
#' # Add a attribute to it to mimic the atomic type
#' molecule %v% "atomic type" <- c(1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3)
#' #
#' # create a plot of the social network
#' # colored by atomic type
#' #
#' plot(molecule, vertex.col="atomic type",vertex.cex=3)
#'
#' # measure tendency to match within each atomic type
#' gest <- ergm(molecule ~ edges + kstar(2) + triangle + nodematch("atomic type"))
#' summary(gest)
#'
#' # compare it to differential homophily by atomic type
#' gest <- ergm(molecule ~ edges + kstar(2) + triangle
#' + nodematch("atomic type",diff=TRUE))
#' summary(gest)
#' }
#' @keywords models
#' @aliases is.ergm ergm.object
#' @export
ergm <- function(formula, response=NULL,
reference=~Bernoulli,
constraints=~.,
obs.constraints=~.-observed,
offset.coef=NULL,
target.stats=NULL,
eval.loglik=getOption("ergm.eval.loglik"),
estimate=c("MLE", "MPLE", "CD"),
control=control.ergm(),
verbose=FALSE,..., basis=ergm.getnetwork(formula)) {
check.control.class("ergm", "ergm")
handle.control.toplevel("ergm", ...)
ergm_call <- match.call(ergm)
reference <- trim_env_const_formula(reference)
constraints <- trim_env_const_formula(constraints)
obs.constraints <- trim_env_const_formula(obs.constraints)
estimate <- match.arg(estimate)
if(estimate=="CD"){
control$init.method <- "CD"
eval.loglik <- FALSE
}
if(estimate=="MPLE"){
control$init.method <- "MPLE"
}
if(!is.null(control$seed)) set.seed(as.integer(control$seed))
if (verbose) message("Evaluating network in model.")
nw <- basis
ergm_preprocess_response(nw,response)
proposalclass <- "c"
if(!is(constraints, "ergm_proposal")){
# Handle the observation process and other "automatic" constraints.
tmp <- .handle.auto.constraints(nw, constraints, obs.constraints, target.stats)
nw <- tmp$nw
conterms.obs <- tmp$conterms.obs
conterms <- tmp$conterms
}else if(!is(obs.constraints, "ergm_proposal")){
# Handle the observation process and other "automatic" constraints.
tmp <- .handle.auto.constraints(nw, trim_env(~.), obs.constraints, target.stats)
nw <- tmp$nw
conterms.obs <- tmp$conterms.obs
conterms <- tmp$conterms
}
if(!is(constraints, "ergm_proposal")){
if (verbose) message("Initializing unconstrained Metropolis-Hastings proposal: ", appendLF=FALSE)
## FIXME: a more general framework is needed?
if(is.valued(nw) && reference==trim_env(~Bernoulli)){
warn(paste0("The default Bernoulli reference distribution operates in the binary (",sQuote("response=NULL"),") mode only. Did you specify the ",sQuote("reference")," argument?"))
}
proposal <- ergm_proposal(conterms, hints=control$MCMC.prop, weights=control$MCMC.prop.weights, control$MCMC.prop.args, nw, class=proposalclass,reference=reference, term.options=control$term.options)
}else proposal <- constraints
if (verbose) message(sQuote(paste0(proposal$pkgname,":MH_",proposal$name)),".")
if (verbose) message("Initializing model...")
model <- ergm_model(formula, nw, extra.aux=NVL3(proposal$auxiliaries,list(proposal=.)), term.options=control$term.options)
proposal$aux.slots <- model$slots.extra.aux$proposal
if (verbose) message("Model initialized.")
if(!is(obs.constraints, "ergm_proposal")){
if(!is.null(conterms.obs)){
if (verbose) message("Initializing constrained Metropolis-Hastings proposal: ", appendLF=FALSE)
proposal.obs <- ergm_proposal(conterms.obs, hints=control$obs.MCMC.prop, weights=control$obs.MCMC.prop.weights, control$obs.MCMC.prop.args, nw, class=proposalclass, reference=reference, term.options=control$term.options)
if (verbose) message(sQuote(paste0(proposal.obs$pkgname,":MH_",proposal.obs$name)), appendLF=FALSE)
if(!is.null(proposal.obs$auxiliaries)){
if(verbose) message(" (requests auxiliaries: updating model).")
model$obs.model <- c(model, ergm_model(trim_env(~.), nw, extra.aux=list(proposal=proposal.obs$auxiliaries), term.options=control$term.options))
proposal.obs$slots.extra.aux <- model$model.obs$slots.extra.aux$proposal
if(verbose) message("Model reinitialized.")
}else if(verbose) message(".")
}else proposal.obs <- NULL
}else proposal.obs <- obs.constraints
## Construct approximate response network if target.stats are given.
if(!is.null(target.stats)){
nw.stats <- summary(model, nw)[!model$etamap$offsetmap]
target.stats <- vector.namesmatch(target.stats, names(nw.stats))
target.stats <- na.omit(target.stats)
if(length(nw.stats)!=length(target.stats)){
stop("Incorrect length of the target.stats vector: should be ", length(nw.stats), " but is ",length(target.stats),". Note that offset() terms should *not* get target statistics.")
}
# no need to pass the offset term's init to SAN
san.control <- control$SAN
if(verbose) message("Constructing an approximate response network.")
## If target.stats are given, overwrite the given network and formula
## with SAN-ed network and formula.
if(control$SAN.maxit > 0){
TARGET_STATS<-san(formula, target.stats=target.stats,
reference=reference,
constraints=constraints,
control=san.control,
only.last=TRUE,
output="ergm_state",
verbose=verbose,
offset.coef=NVL(offset.coef,control$init[model$etamap$offsettheta]))
if(verbose) message("Finished SAN run.")
}else{
TARGET_STATS <- nw
}
# From this point on, target.stats has NAs corresponding to the
# offset terms.
#
# TODO: Only have target.stats contain non-offset terms'
# statistics, and have the rest of the code handle it
# intelligently.
target.stats <- .align.target.stats.offset(model, target.stats)
nw <- TARGET_STATS <- as.network(TARGET_STATS)
formula<-nonsimp_update.formula(formula,TARGET_STATS~., from.new="TARGET_STATS")
} else {
if (network.edgecount(nw) == 0) warning("Network is empty and no target stats are specified.")
}
# TODO: SAN has this information, so maybe we should grab it from there if SAN does get run.
nw.stats <- summary(model, nw, term.options=control$term.options)
# conddeg MPLE has been superceded, but let the user know:
if(!is.directed(nw) && ("degrees" %in% names(proposal$arguments$constraints) ||
all(c("b1degrees","b2degrees") %in% names(proposal$arguments$constraints)))) message("Note that degree-conditional MPLE has been removed in version 4.0, having been superceded by Contrastive Divergence.")
# The following kludge knocks out MPLE if the sample space
# constraints are not dyad-independent. For example, ~observed
# constraint is dyad-independent, while ~edges is not.
#
# TODO: Create a flexible and general framework to manage methods
# for obtaining initial values.
init.candidates <- proposal$reference$init_methods
if("MPLE" %in% init.candidates && !is.dyad.independent(proposal$arguments$constraints,
proposal.obs$arguments$constraints)){
init.candidates <- init.candidates[init.candidates!="MPLE"]
if(verbose) message("MPLE cannot be used for this constraint structure.")
}
control$init.method <- ERRVL(try(match.arg(control$init.method, init.candidates), silent=TRUE), {
message("Specified initial parameter method ", sQuote(control$init.method), " is not in the list of candidates. Use at your own risk.")
control$init.method
})
if(verbose) message(paste0("Using initial method '",control$init.method,"'."))
if(is.curved(model) ||
!all(model$etamap$mintheta==-Inf) ||
!all(model$etamap$maxtheta==+Inf)){ # Curved or constrained model: use ergm.logitreg() rather than glm().
control$MPLE.type <- "logitreg"
}
# If some control$init is specified...
if(!is.null(control$init)){
# Check length of control$init.
if (length(control$init)!=length(model$etamap$offsettheta)) {
if(verbose){
message("control$init =")
message_print(control$init)
message("number of statistics is ", nparam(model, canonical=TRUE), "")
}
stop(paste("Invalid starting parameter vector control$init:",
"wrong number of parameters."))
}
}else control$init <- rep(NA, length(model$etamap$offsettheta)) # Set the default value of control$init.
if(!is.null(offset.coef)){
# TODO: Names matching here?
if(length(control$init[model$etamap$offsettheta])!=length(offset.coef))
stop("Invalid offset parameter vector offset.coef: ",
"wrong number of parameters: expected ",
length(control$init[model$etamap$offsettheta]),
" got ",length(offset.coef),".")
control$init[model$etamap$offsettheta]<-offset.coef
}
# Make sure any offset elements are given in control$init.
if(any(is.na(control$init) & model$etamap$offsettheta)) stop("The model contains offset terms whose parameter values have not been specified:", paste.and(param_names(model)[is.na(control$init)&model$offsettheta]), ".", sep="")
# Check if any terms are constrained to a constant and issue a warning.
constrcheck <- ergm.checkconstraints.model(model, proposal, control$init)
model <- constrcheck$model; control$init <- constrcheck$init
# Check if any terms are at their extremes and handle them depending on control$drop.
extremecheck <- ergm.checkextreme.model(model=model, nw=nw, init=control$init, target.stats=target.stats, drop=control$drop)
model <- extremecheck$model; control$init <- extremecheck$init
# MPLE is not supported for valued ERGMs.
if(estimate=="MPLE"){
if(is.valued(nw)) stop("Maximum Pseudo-Likelihood (MPLE) estimation for valued ERGM terms is not implemented at this time. You may want to use Contrastive Divergence by passing estimate='CD' instead.")
if(!is.dyad.independent(proposal$arguments$constraints,
proposal.obs$arguments$constraints))
message("Maximum Pseudo-Likelihood (MPLE) estimation for ERGMs with dyad-dependent constraints is not implemented at this time. You may want to use Contrastive Divergence by passing estimate='CD' instead.")
}
if (verbose) { message("Fitting initial model.") }
MPLE.is.MLE <- (proposal$reference$name=="Bernoulli"
&& is.dyad.independent(model)
&& !control$force.main
&& is.dyad.independent(proposal$arguments$constraints,
proposal.obs$arguments$constraints))
# If all other criteria for MPLE=MLE are met, _and_ SAN network matches target.stats directly, we can get away with MPLE.
if (!is.null(target.stats) && !isTRUE(all.equal(target.stats[!is.na(target.stats)],nw.stats[!is.na(target.stats)]))) message("Unable to match target stats. Using MCMLE estimation.")
MCMCflag <- (estimate=="MLE" && (!MPLE.is.MLE
|| (!is.null(target.stats) && !isTRUE(all.equal(target.stats,nw.stats)))
)
|| control$force.main)
# Short-circuit the optimization if all terms are either offsets or dropped.
if(all(model$etamap$offsettheta)){
# Note that this cannot be overridden with control$force.main.
message("All terms are either offsets or extreme values. No optimization is performed.")
return(structure(list(coefficients=control$init,
call=ergm_call,
iterations=0,
loglikelihood=NA,
mle.lik=NULL,
gradient=rep(NA,length=length(control$init)),
failure=TRUE,
offset=model$etamap$offsettheta,
drop=if(control$drop) extremecheck$extremeval.theta,
estimable=constrcheck$estimable,
network=nw,
reference=reference,
newnetwork=nw,
formula=formula,
constrained=proposal$arguments$constraints,
constrained.obs=proposal.obs$arguments$constraints,
constraints=constraints,
target.stats=target.stats,
target.esteq=if(!is.null(target.stats)) ergm.estfun(rbind(target.stats), control$init, model),
estimate=estimate,
ergm_version=packageVersion("ergm"),
control=control
),
class="ergm"))
}
model$nw.stats <- nw.stats
model$target.stats <- target.stats
if(control$init.method=="CD") if(is.null(names(control$init)))
names(control$init) <- param_names(model, FALSE)
initialfit <- ergm.initialfit(init=control$init, initial.is.final=!MCMCflag,
formula=formula, nw=nw, reference=reference,
m=model, method=control$init.method,
MPLEtype=control$MPLE.type,
control=control,
proposal=proposal,
proposal.obs=proposal.obs,
verbose=if(MCMCflag) FALSE else verbose,
...)
switch(control$init.method,
MPLE = NVL3(initialfit$xmat.full, check_nonidentifiability(., coef(initialfit), model,
tol = control$MPLE.nonident.tol, type="covariates",
nonident_action = control$MPLE.nonident,
nonvar_action = control$MPLE.nonvar)),
CD = NVL3(initialfit$sample, check_nonidentifiability(as.matrix(.), coef(initialfit), model,
tol = control$MPLE.nonident.tol, type="statistics",
nonident_action = control$MPLE.nonident,
nonvar_action = control$MPLE.nonvar))
)
initialfit$xmat.full <- NULL # No longer needed but takes up space.
estimate.desc <- switch(estimate,
MPLE = if(MPLE.is.MLE) "Maximum Likelihood"
else "Maximum Pseudolikelihood",
CD = "Contrastive Divergence",
MLE = paste0(if(MCMCflag) # If not, it's just MLE.
switch(control$main.method,
MCMLE = "Monte Carlo ",
`Stochastic-Approximation`="Stochastic Approximation ",
`Robbins-Monro`="Robbins-Monro ",
`Stepping`="Hummel Stepping "),
"Maximum Likelihood"))
if (!MCMCflag){ # Just return initial (non-MLE) fit and exit.
message("Stopping at the initial estimate.")
initialfit$call <- ergm_call
initialfit$ergm_version <- packageVersion("ergm")
initialfit$offset <- model$etamap$offsettheta
initialfit$MPLE_is_MLE <- MPLE.is.MLE
initialfit$drop <- if(control$drop) extremecheck$extremeval.theta
initialfit$estimable <- constrcheck$estimable
initialfit$network <- nw
initialfit$reference <- reference
initialfit$newnetwork <- nw
initialfit$formula <- formula
initialfit$constrained <- proposal$arguments$constraints
initialfit$constrained.obs <- proposal.obs$arguments$constraints
initialfit$constraints <- constraints
initialfit$obs.constraints <- obs.constraints
initialfit$target.stats <- suppressWarnings(na.omit(model$target.stats))
initialfit$nw.stats <- model$nw.stats
initialfit$etamap <- model$etamap
initialfit$target.esteq <- suppressWarnings(na.omit(if(!is.null(model$target.stats)) ergm.estfun(rbind(model$target.stats), coef(initialfit), model)))
initialfit$estimate <- estimate
initialfit$estimate.desc <- estimate.desc
initialfit$control<-control
if(any(!model$etamap$offsettheta) && eval.loglik){
message("Evaluating log-likelihood at the estimate. ",appendLF=FALSE)
initialfit<-logLik(initialfit, add=TRUE, control=control$loglik, verbose=verbose)
message("")
}
return(initialfit)
}
# Otherwise, set up the main phase of estimation:
ergm.getCluster(control, max(verbose-1,0))
# Revise the initial value, if necessary:
init <- coef(initialfit)
init[is.na(init)] <- 0
names(init) <- param_names(model, FALSE)
mainfit <- switch(control$main.method,
"Robbins-Monro" = ergm.robmon(init, nw, model,
proposal=proposal, verbose=verbose, control=control),
"Stochastic-Approximation" = ergm.stocapprox(init, nw, model,
control=control, proposal=proposal,
verbose=verbose),
"Stepping" = ergm.stepping(init, nw, model, initialfit, constraints,
#nstats=nstats,
#approx=lognormapprox, filename.prefix=NULL,
#control=control.ergm(nsim1=100, nsim2=1000, gridsize=100), # simulation parameters
#plots=FALSE, # currently useless, but plots can be reimplemented
control=control,
proposal=proposal, proposal.obs=proposal.obs,
verbose=verbose,...),
"MCMLE" = ergm.MCMLE(init, nw,
model,
# no need to pass initialfit to MCMLE
initialfit=(initialfit<-NULL),
control=control, proposal=proposal,
proposal.obs=proposal.obs,
verbose=verbose,
...),
stop("Method ", control$main.method, " is not implemented.")
)
initialfit <- NULL
# done with main fit
mainfit$call <- ergm_call
mainfit$ergm_version <- packageVersion("ergm")
mainfit$MPLE_is_MLE <- MPLE.is.MLE
mainfit$formula <- formula
mainfit$target.stats <- suppressWarnings(na.omit(model$target.stats))
mainfit$nw.stats <- model$nw.stats
mainfit$target.esteq <- suppressWarnings(na.omit(if(!is.null(model$target.stats)) ergm.estfun(rbind(model$target.stats), coef(mainfit), model)))
mainfit$constrained <- proposal$arguments$constraints
mainfit$constrained.obs <- proposal.obs$arguments$constraints
mainfit$constraints <- constraints
mainfit$obs.constraints <- obs.constraints
# unless the main fitting algorithm passes back a modified control
NVL(mainfit$control) <- control
mainfit$reference<-reference
mainfit$estimate <- estimate
mainfit$estimate.desc <- estimate.desc
mainfit$offset <- model$etamap$offsettheta
mainfit$drop <- if(control$drop) extremecheck$extremeval.theta
mainfit$estimable <- constrcheck$estimable
mainfit$etamap <- model$etamap
if (!control$MCMC.return.stats)
mainfit$sample <- NULL
if(eval.loglik){
message("Evaluating log-likelihood at the estimate. ", appendLF=FALSE)
mainfit<-logLik(mainfit, add=TRUE, control=control$loglik, verbose=verbose)
}
if (MCMCflag) {
message(paste(strwrap("This model was fit using MCMC. To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function."),collapse="\n"))
}
mainfit
}