-
Notifications
You must be signed in to change notification settings - Fork 41
/
join.ado
669 lines (565 loc) · 18 KB
/
join.ado
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
*! version 2.48.0 29mar2021
program define join
// Parse --------------------------------------------------------------------
syntax ///
[anything] /// Variables that will be added (default is _all unless keepnone is used)
, ///
[from(string asis) into(string asis)] /// -using- dataset
[by(string)] /// Primary and foreign keys
[KEEP(string)] /// 1 master 2 using 3 match
[ASSERT(string)] /// 1 master 2 using 3 match
[GENerate(name) NOGENerate] /// _merge variable
[UNIQuemaster] /// Assert that -by- is an id in the master dataset
[noLabel] ///
[KEEPNone] ///
[noNOTEs] ///
[noREPort] ///
[Verbose] ///
[METHOD(string)] // empty, or hash0, hash1, etc.
* Parse details of using dataset
_assert (`"`from'"' != "") + (`"`into'"' != "") == 1, ///
msg("specify either from() or into()")
ParseUsing `from'`into' // Return -filename- and -if-
* Parse _merge indicator
_assert ("`generate'" != "") + ("`nogenerate'" != "") < 2, ///
msg("generate() and nogenerate are mutually exclusive")
if ("`nogenerate'" == "") {
if ("`generate'" == "") loc generate _merge
confirm new variable `generate'
}
else {
tempvar generate
}
* Parse booleans
loc is_from = (`"`from'"' != "")
loc uniquemaster = ("`uniquemaster'" != "")
loc label = ("`label'" == "")
loc notes = ("`notes'" == "")
loc report = ("`report'" == "")
loc verbose = ("`verbose'" != "")
* Parse keep() and assert() requirements
ParseMerge, keep(`keep') assert(`assert')
/* Return locals
keep_using: 1 if we will keep using-only obs
assert_not_using: 1 to check that there are no using-only obs.
keep_nums: {1, 3, 1 3} depending on whether we keep master/match
assert_nums: as above but to assert only these exist (besides using)
keep_words assert_words: as above but with words instead of nums
*/
* Parse -key- variables
ParseBy `is_from' `by' /// Return -master_keys- and -using_keys-
// Load using dataset -------------------------------------------------------
* Load -using- dataset
if (`is_from') {
preserve
if (substr("`filename'", -8, 8) == ".parquet") {
if ("`anything'" != "" | "`keepnone'" != "") {
loc vars "`using_keys' `anything' using"
}
cap noi parquet use `vars' "`filename'", clear
if (_rc != 0) {
di as err "Parquet reading failed"
di as err "Try reading parquet file directly to see full error:"
di as err " parquet use `vars' `filename'"
exit _rc
}
}
else {
use "`filename'", clear
unab using_keys : `using_keys' // continuation of ParseBy
}
if (`"`if'"' != "") qui keep `if'
loc cmd restore
}
else {
loc cmd `"qui use `if' using "`filename'", clear"'
}
if ("`anything'" != "" | "`keepnone'" != "") {
keep `using_keys' `anything'
}
else {
qui ds `using_keys', not
loc anything `r(varlist)'
}
unab anything : `anything', min(0)
// Join ---------------------------------------------------------------------
mata: join("`using_keys'", "`master_keys'", "`anything'", ///
`"`cmd'"', "`generate'", `uniquemaster', ///
`keep_using', `assert_not_using', ///
`label', `notes', ///
`verbose', "`method'")
// Apply requirements on _merge variable ------------------------------------
cap la def _merge ///
1 "master only (1)" 2 "using only (2)" 3 "matched (3)" /// Used
4 "missing updated (4)" 5 "nonmissing conflict (5)" // Unused
la val `generate' _merge
loc msg "merge: after merge, not all observations from <`assert_words'>"
if ("`assert_nums'" == "") _assert !inlist(`generate', 1, 3), msg("`msg'")
if ("`assert_nums'" == "1") _assert !inlist(`generate', 3), msg("`msg'")
if ("`assert_nums'" == "3") _assert !inlist(`generate', 1), msg("`msg'")
if ("`keep_nums'" == "") qui drop if inlist(`generate', 1, 3)
if ("`keep_nums'" == "1") qui drop if inlist(`generate', 3)
if ("`keep_nums'" == "3") qui drop if inlist(`generate', 1)
* Adding data should clear the sort order of the master dataset
if (`keep_using') {
ClearSortOrder
}
if (`report') {
Table `generate'
}
if ("`nogenerate'" != "") {
label drop _merge
}
end
program define ParseUsing
* SAMPLE INPUT: somefile.dta if foreign==true
gettoken filename if : 0,
c_local filename `"`filename'"'
loc if `if' // remove leading/trailing spaces
c_local if `"`if'"'
end
program define ParseMerge
syntax, [keep(string) assert(string)]
if ("`keep'" == "") loc keep "master match using"
if ("`assert'" == "") loc assert "master match using"
loc keep_using 0
loc assert_not_using 1
loc match_valid `""3", "match", "mat", "matc", "matches", "matched""'
foreach cat in keep assert {
loc nums
loc words
foreach word of local `cat' {
if ("`word'"=="1" | substr("`word'", 1, 3) == "mas") {
loc nums `nums' 1
loc words `words' master
}
else if ("`word'"=="2" | substr("`word'", 1, 2) == "us") {
if ("`cat'" == "keep") loc keep_using 1
if ("`cat'" == "assert") loc assert_not_using 0
}
else if (inlist("`word'", `match_valid')) {
loc nums `nums' 3
loc words `words' match
}
else {
di as error "invalid category: <`word'>"
error 117
}
}
loc words : list sort words
loc nums : list sort nums
if ("`cat'"=="assert" & !`assert_not_using') loc words `words' using
c_local `cat'_words `words'
c_local `cat'_nums `nums'
}
c_local keep_using `keep_using'
c_local assert_not_using `assert_not_using'
end
program define ParseBy
* SAMPLE INPUT: 1 turn trunk
* SAMPLE INPUT: 0 year=time country=cou
gettoken is_from 0 : 0 // 1 if used from() , 0 if used into()
assert inlist(`is_from', 0, 1)
while ("`0'" != "") {
gettoken right 0 : 0
gettoken left right : right, parse("=")
if ("`right'" != "") {
gettoken eqsign right : right, parse("=")
assert "`eqsign'" == "="
}
else {
loc right `left'
}
loc master_keys `master_keys' `left'
loc using_keys `using_keys' `right'
}
* Mata functions such as st_vartype() don't play well with abbreviations
if (`is_from') unab master_keys : `master_keys'
if (!`is_from') unab using_keys : `using_keys'
c_local master_keys `master_keys'
c_local using_keys `using_keys'
end
program define ClearSortOrder
* Andrew Maurer's trick to clear `: sortedby'
* copied from fsort.ado
* see https://github.com/sergiocorreia/ftools/issues/32
loc sortvar : sortedby
if ("`sortvar'" != "") {
loc sortvar : word 1 of `sortvar'
loc sortvar_type : type `sortvar'
loc sortvar_is_str = strpos("`sortvar_type'", "str") == 1
loc val = `sortvar'[1]
if (`sortvar_is_str') {
qui replace `sortvar' = cond(mi(`"`val'"'), ".", "") in 1
qui replace `sortvar' = `"`val'"' in 1
}
else {
qui replace `sortvar' = cond(mi(`val'), 0, .) in 1
qui replace `sortvar' = `val' in 1
}
assert "`: sortedby'" == ""
}
end
program define Table
syntax varname
* Initialize defaults
loc N 0
forval i = 1/3 {
loc m`i' 0
}
if (c(N)) {
tempname freqs values
tab `varlist', nolabel nofreq matcell(`freqs') matrow(`values')
loc N = rowsof(`freqs')
loc is_temp = substr("`varlist'", 1, 2) == "__"
}
* Fill actual values
forval i = 1/`N' {
loc j = `values'[`i', 1]
loc m`j' = `freqs'[`i', 1]
if (!`is_temp') loc v`j' "(`varlist'==`j')"
}
* This chunk is based on merge.ado
di
di as smcl as txt _col(5) "Result" _col(38) "# of obs."
di as smcl as txt _col(5) "{hline 41}"
di as smcl as txt _col(5) "not matched" ///
_col(30) as res %16.0fc (`m1'+`m2')
if (`m1'|`m2') {
di as smcl as txt _col(9) "from master" ///
_col(30) as res %16.0fc `m1' as txt " `v1'"
di as smcl as txt _col(9) "from using" ///
_col(30) as res %16.0fc `m2' as txt " `v2'"
di
}
di as smcl as txt _col(5) "matched" ///
_col(30) as res %16.0fc `m3' as txt " `v3'"
di as smcl as txt _col(5) "{hline 41}"
end
findfile "ftools.mata"
include "`r(fn)'"
mata:
mata set matastrict on
//mata set matalnum on
void join(`String' using_keys,
`String' master_keys,
`String' varlist,
`String' cmd,
`Varname' generate,
`Boolean' uniquemaster,
`Boolean' keep_using,
`Boolean' assert_not_using,
`Boolean' join_labels,
`Boolean' join_chars,
`Boolean' verbose,
`String' method)
{
`Varlist' pk_names, fk_names, varformats
`Varlist' varnames_num, varnames_str, deck
`Varlist' vartypes_num, vartypes_str
`Variables' pk, fk
`Integer' N, i, val, j, k
`Factor' F
`DataFrame' data_num, reshaped_num, data_str, reshaped_str
`Vector' index, range, mask
`Boolean' integers_only
`Boolean' has_using
`Varname' var
`String' msg
`StringVector' varlabels, varvaluelabels, pk_varvaluelabels
`Dict' label_values, label_text
`Vector' values
`StringVector' text
`String' label
`Integer' old_width, new_width
`Integer' num_chars
`StringMatrix' chars
`StringVector' charnames
`String' char_name, char_val
// Note:
// - On the -using- dataset the keys will be unique, hence why they are the PKs (primary keys)
// - On the -master- dataset we allow duplicates (unless -uniquemaster- is set), hence whey they are FKs (foreign keys)
// Using
pk_names = tokens(using_keys)
pk = __fload_data(pk_names)
N = rows(pk)
// Assert keys are unique IDs in using
integers_only = is_integers_only(pk_names, pk)
F = _factor(pk, integers_only, verbose, method, 0)
assert_is_id(F, using_keys, "using")
varnames_num = varnames_str = deck = tokens(varlist)
vartypes_num = vartypes_str = J(1, cols(deck), "")
varformats = J(1, cols(deck), "")
varlabels = J(1, cols(deck), "")
varvaluelabels = J(1, cols(deck), "")
pk_varvaluelabels = J(1, cols(pk_names), "")
label_values = asarray_create("string", 1)
label_text = asarray_create("string", 1)
text = ""
values = .
if (join_chars) {
num_chars = rows(st_dir("char", "_dta", "*"))
}
for (i=1; i<=cols(deck); i++) {
var = deck[i]
// Assert vars are not strings (could allow for it, but not useful)
if (st_isstrvar(var)) {
varnames_num[i] = ""
vartypes_str[i] = st_vartype(var)
}
else {
varnames_str[i] = ""
vartypes_num[i] = st_vartype(var)
}
// Add variable labels, value labels, and assignments
varformats[i] = st_varformat(var)
varlabels[i] = st_varlabel(var)
varvaluelabels[i] = label = st_varvaluelabel(var)
if (join_labels) {
if (label != "" ? st_vlexists(label) : 0) {
st_vlload(label, values, text)
asarray(label_values, label, values)
asarray(label_text, label, text)
}
}
if (join_chars) {
num_chars = num_chars + rows(st_dir("char", var, "*"))
}
}
// Save value labels from the by() variables
if (join_labels) {
for (i=1; i<=cols(pk_names); i++) {
var = pk_names[i]
label = st_varvaluelabel(var)
if (label != "" ? st_vlexists(label) : 0) {
pk_varvaluelabels[i] = label
st_vlload(label, values, text)
asarray(label_values, label, values)
asarray(label_text, label, text)
}
}
}
// Save chars
// Note: we are NOT saving chars from the by() variables
if (join_chars) {
chars = J(num_chars, 3, "")
j = 0
for (k=0; k<=cols(deck); k++) {
var = k ? deck[k] : "_dta"
charnames = st_dir("char", var, "*")
for (i=1 ; i<=rows(charnames); i++) {
++j
chars[j, 1] = var
chars[j, 2] = charnames[i]
chars[j, 3] = st_global(sprintf("%s[%s]", var, charnames[i]))
}
}
}
varnames_num = tokens(invtokens(varnames_num))
varnames_str = tokens(invtokens(varnames_str))
vartypes_num = tokens(invtokens(vartypes_num))
vartypes_str = tokens(invtokens(vartypes_str))
if (cols(varnames_num) > 0) {
data_num = st_data(., varnames_num) , J(st_nobs(), 1, 3) // _merge==3
}
else {
data_num = J(st_nobs(), 1, 3) // _merge==3
}
if (cols(varnames_str) > 0) {
data_str = st_sdata(., varnames_str)
}
// Master
stata(cmd) // load (either -restore- or -use-)
if (cmd != "restore") {
stata("unab master_keys : " + master_keys) // continuation of ParseBy
master_keys = st_local("master_keys")
}
// Check that variables don't exist yet
msg = "{err}merge: variable %s already exists in master dataset\n"
for (i=1; i<=cols(deck); i++) {
var = deck[i]
if (_st_varindex(var) != .) {
printf(msg, var)
exit(108)
}
}
if (verbose) printf("{txt}variables added: {res}%s{txt}\n", invtokens(deck))
fk_names = tokens(master_keys)
fk = __fload_data(fk_names)
if (integers_only) {
integers_only = is_integers_only(fk_names, fk)
}
if (verbose) {
printf("{txt}(integers only? {res}%s{txt})\n", integers_only ? "true" : "false")
}
F = _factor(pk \ fk, integers_only, verbose, method, 0)
// Fill -reshaped_num- matrix with data from -using-
// 1. Start with the matrix full of MVs, for levels that appear only in -master- (_merge==1)
reshaped_num = J(F.num_levels, cols(data_num)-1, .) , J(F.num_levels, 1, 1) // _merge==1
// 2. Get the levels that also appear in -using-
index = F.levels[| 1 \ N |] // Note that F.levels is unique in 1..N only because the keys are unique in -using-
// 3. Populate the rows that are also in -using- with the data from using
reshaped_num[index, .] = data_num
if (cols(varnames_str) > 0) {
reshaped_str = J(F.num_levels, cols(data_str), "")
reshaped_str[index, .] = data_str
}
// 4. Rearrange and optionally expand the matrix to conform to the -master- dataset
index = F.levels[| N+1 \ . |]
reshaped_num = reshaped_num[index , .]
if (cols(varnames_str) > 0) {
reshaped_str = reshaped_str[index , .]
}
index = . // conserve memory
assert(st_nobs() == rows(reshaped_num))
vartypes_num = vartypes_num, "byte"
varnames_num = varnames_num, generate
val = setbreakintr(0)
st_store(., st_addvar(vartypes_num, varnames_num, 1), reshaped_num)
if (cols(varnames_str) > 0) {
st_sstore(., st_addvar(vartypes_str, varnames_str, 1), reshaped_str)
}
reshaped_num = reshaped_str = . // conserve memory
(void) setbreakintr(val)
// Add labels of new variables
msg = "{err}(warning: value label %s already exists; values overwritten)"
for (i=1; i<=cols(deck); i++) {
var = deck[i]
// label variable <var> <text>
if (varlabels[i] != "") {
st_varlabel(var, varlabels[i])
}
st_varformat(var, varformats[i])
label = varvaluelabels[i]
if (label != "") {
// label values <varlist> <label>
st_varvaluelabel(var, label)
if (join_labels) {
// Warn if value label gets overwritten
if (st_vlexists(label)) {
printf(msg, label)
}
// label define <label> <#> <text> <...>
st_vlmodify(label,
asarray(label_values, label),
asarray(label_text, label))
}
}
}
// Add value labels of by variables
if (join_labels) {
for (i=1; i<=cols(pk_names); i++) {
var = pk_names[i]
label = pk_varvaluelabels[i]
if (label == "") continue // Continue if no value label
if (st_vlexists(label)) printf(msg, label) // Warn
st_vlmodify(label,
asarray(label_values, label),
asarray(label_text, label))
st_varvaluelabel(var, label)
}
}
// Add chars and notes
if (join_chars) {
for (i=1; i<=num_chars; i++) {
var = chars[i, 1]
char_name = chars[i, 2]
char_val = chars[i, 3]
if (anyof(("note0", "iis", "tis", "_TSpanel", "_TStvar", "_TSitrvl", "_TSdelta"), char_name)) {
continue
}
else if (strpos(char_name, "note")==1) {
stata(sprintf("note %s: %s", var, char_val))
}
else {
st_global(sprintf("%s[%s]", var, char_name), char_val)
}
}
}
// Add using-only data
// status_using = 1 (assert not) 2 (drop it) 3 (keep it)
if (keep_using | assert_not_using) {
mask = (F.counts[F.levels[| 1 \ N |]] :== 1)
has_using = any(mask)
if (assert_not_using & has_using) {
_error("merge found observations from using")
}
if (keep_using & has_using) {
// Store keys (numeric or string)
pk = select(pk, mask)
range = st_nobs() + 1 :: st_nobs() + rows(pk)
st_addobs(rows(pk))
if (eltype(pk)=="string") {
// We might need to recast the variables
for (i=1; i<=cols(pk); i++) {
new_width = max(strlen(pk[., i]))
old_width = strtoreal(substr(st_vartype(fk_names[i]), 4, .))
if (old_width < new_width) {
// Recast fails; perhaps there is a bug in recast.ado
// or a conflict with this program
// Thus, we resort to a hack
stata(sprintf("assert mi(%s) in -1", fk_names[i]))
stata(sprintf(`"replace %s = "%s" in -1"', fk_names[i], " " * new_width))
}
}
st_sstore(range, fk_names, pk)
}
else {
st_store(range, fk_names, pk)
}
// Store numeric vars
data_num = select(data_num, mask)
data_num[., cols(data_num)] = J(rows(data_num), 1, 2) // _merge==2
st_store(range, varnames_num, data_num)
// Store string vars
if (cols(varnames_str) > 0) {
data_str = select(data_str, mask)
st_sstore(range, varnames_str, data_str)
// Move _merge to the end
stata("order " + generate + ", last")
}
}
}
// Ensure that the keys are unique in master
// (This changes F so must be run at the end)
if (uniquemaster) {
F.drop_obs(1 :: N)
assert_is_id(F, master_keys, "master")
}
}
`Boolean' is_integers_only(`Varlist' vars, `DataFrame' data)
{
`Boolean' integers_only
`Integer' i
`String' type
// First look at the variable types
for (i = integers_only = 1; i <= cols(vars); i++) {
type = st_vartype(vars[i])
if (!anyof(("byte", "int", "long"), type)) {
integers_only = 0
break
}
}
// However, long IDs can be doubles.
// If there is only one ID and it's a double, verify if it's an integer
if (!integers_only & cols(vars)==1 & st_vartype(vars[1]) == "double") {
integers_only = !any(mod(data, 1)) // all(data :== floor(data))
}
return(integers_only)
}
void assert_is_id(`Factor' F, `String' keys, `String' dta)
{
`String' msg
`Boolean' plural
plural = length(tokens(keys)) > 1
msg = sprintf("variable%s %s do%s not uniquely identify observations in the %s data",
plural ? "s" : "", keys, plural ? "" : "es", dta)
if (!F.is_id()) {
// Graceful error handling, as in The Mata Book (Appendix A.4)
errprintf(msg)
exit(459)
// NotReached
}
}
end
exit