/
gserialized_estimate.c
2571 lines (2216 loc) · 72.1 KB
/
gserialized_estimate.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**********************************************************************
*
* PostGIS - Spatial Types for PostgreSQL
* http://postgis.net
*
* PostGIS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* PostGIS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
*
**********************************************************************
*
* Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
*
**********************************************************************/
/**********************************************************************
THEORY OF OPERATION
The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
calculates (compute_gserialized_stats_mode) two histograms of occurances of
features, once for the 2D domain (and the && operator) one for the
ND domain (and the &&& operator).
Queries in PostgreSQL call into the selectivity sub-system to find out
the relative effectiveness of different clauses in sub-setting
relations. Queries with constant arguments call gserialized_gist_sel,
queries with relations on both sides call gserialized_gist_joinsel.
gserialized_gist_sel sums up the values in the histogram that overlap
the contant search box.
gserialized_gist_joinsel sums up the product of the overlapping
cells in each relation's histogram.
Depending on the operator and type, the mode of selectivity calculation
will be 2D or ND.
- geometry && geometry ==> 2D
- geometry &&& geometry ==> ND
- geography && geography ==> ND
The 2D mode is put in effect by retrieving the 2D histogram from the
statistics cache and then allowing the generic ND calculations to
go to work.
TO DO: More testing and examination of the &&& operator and mixed
dimensionality cases. (2D geometry) &&& (3D column), etc.
**********************************************************************/
#include "postgres.h"
#include "access/genam.h"
#include "access/gin.h"
#include "access/gist.h"
#include "access/gist_private.h"
#include "access/gistscan.h"
#include "utils/datum.h"
#include "access/heapam.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "catalog/namespace.h"
#include "catalog/indexing.h"
#if PG_VERSION_NUM >= 100000
#include "utils/regproc.h"
#include "utils/varlena.h"
#endif
#include "utils/tqual.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/snapmgr.h"
#include "utils/fmgroids.h"
#include "funcapi.h"
#include "access/heapam.h"
#include "catalog/pg_type.h"
#include "access/relscan.h"
#include "executor/spi.h"
#include "fmgr.h"
#include "commands/vacuum.h"
#include "nodes/relation.h"
#include "parser/parsetree.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
#include "utils/builtins.h"
#include "utils/syscache.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
#include "../postgis_config.h"
#if POSTGIS_PGSQL_VERSION >= 93
#include "access/htup_details.h"
#endif
#include "stringbuffer.h"
#include "liblwgeom.h"
#include "lwgeom_pg.h" /* For debugging macros. */
#include "gserialized_gist.h" /* For index common functions */
#include <math.h>
#if HAVE_IEEEFP_H
#include <ieeefp.h>
#endif
#include <float.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
/************************************************************************/
/* Fall back to older finite() if necessary */
#ifndef HAVE_ISFINITE
# ifdef HAVE_GNU_ISFINITE
# define _GNU_SOURCE
# else
# define isfinite finite
# endif
#endif
/* Prototypes */
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
Datum _postgis_gserialized_index_extent(PG_FUNCTION_ARGS);
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
/* Local prototypes */
static Oid table_get_spatial_index(Oid tbl_oid, text *col, int *key_type);
static GBOX * spatial_index_read_extent(Oid idx_oid, int key_type);
/* Old Prototype */
Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
/*
* Assign a number to the n-dimensional statistics kind
*
* tgl suggested:
*
* 1-100: reserved for assignment by the core Postgres project
* 100-199: reserved for assignment by PostGIS
* 200-9999: reserved for other globally-known stats kinds
* 10000-32767: reserved for private site-local use
*/
#define STATISTIC_KIND_ND 102
#define STATISTIC_KIND_2D 103
#define STATISTIC_SLOT_ND 0
#define STATISTIC_SLOT_2D 1
/*
* To look-up the spatial index associated with a table we
* need to find GIST indexes using our spatial keys.
*/
#define INDEX_KEY_ND "gidx"
#define INDEX_KEY_2D "box2df"
/*
* The SD factor restricts the side of the statistics histogram
* based on the standard deviation of the extent of the data.
* SDFACTOR is the number of standard deviations from the mean
* the histogram will extend.
*/
#define SDFACTOR 3.25
/**
* The maximum number of dimensions our code can handle.
* We'll use this to statically allocate a bunch of
* arrays below.
*/
#define ND_DIMS 4
/**
* Minimum width of a dimension that we'll bother trying to
* compute statistics on. Bearing in mind we have no control
* over units, but noting that for geographics, 10E-5 is in the
* range of meters, we go lower than that.
*/
#define MIN_DIMENSION_WIDTH 0.000000001
/**
* Default geometry selectivity factor
*/
#define DEFAULT_ND_SEL 0.0001
#define DEFAULT_ND_JOINSEL 0.001
/**
* More modest fallback selectivity factor
*/
#define FALLBACK_ND_SEL 0.2
#define FALLBACK_ND_JOINSEL 0.3
/**
* N-dimensional box type for calculations, to avoid doing
* explicit axis conversions from GBOX in all calculations
* at every step.
*/
typedef struct ND_BOX_T
{
float4 min[ND_DIMS];
float4 max[ND_DIMS];
} ND_BOX;
/**
* N-dimensional box index type
*/
typedef struct ND_IBOX_T
{
int min[ND_DIMS];
int max[ND_DIMS];
} ND_IBOX;
/**
* N-dimensional statistics structure. Well, actually
* four-dimensional, but set up to handle arbirary dimensions
* if necessary (really, we just want to get the 2,3,4-d cases
* into one shared piece of code).
*/
typedef struct ND_STATS_T
{
/* Dimensionality of the histogram. */
float4 ndims;
/* Size of n-d histogram in each dimension. */
float4 size[ND_DIMS];
/* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
ND_BOX extent;
/* How many rows in the table itself? */
float4 table_features;
/* How many rows were in the sample that built this histogram? */
float4 sample_features;
/* How many not-Null/Empty features were in the sample? */
float4 not_null_features;
/* How many features actually got sampled in the histogram? */
float4 histogram_features;
/* How many cells in histogram? (sizex*sizey*sizez*sizem) */
float4 histogram_cells;
/* How many cells did those histogram features cover? */
/* Since we are pro-rating coverage, this number should */
/* now always equal histogram_features */
float4 cells_covered;
/* Variable length # of floats for histogram */
float4 value[1];
} ND_STATS;
/**
* Given that geodetic boxes are X/Y/Z regardless of the
* underlying geometry dimensionality and other boxes
* are guided by HAS_Z/HAS_M in their dimesionality,
* we have a little utility function to make it easy.
*/
static int
gbox_ndims(const GBOX* gbox)
{
int dims = 2;
if ( FLAGS_GET_GEODETIC(gbox->flags) )
return 3;
if ( FLAGS_GET_Z(gbox->flags) )
dims++;
if ( FLAGS_GET_M(gbox->flags) )
dims++;
return dims;
}
/**
* Utility function to see if the first
* letter of the mode argument is 'N'. Used
* by the _postgis_* functions.
*/
static int
text_p_get_mode(const text *txt)
{
int mode = 2;
if (VARSIZE(txt) - VARHDRSZ <= 0)
return mode;
char *modestr = (char*)VARDATA(txt);
if ( modestr[0] == 'N' )
mode = 0;
return mode;
}
/**
* Integer comparison function for qsort
*/
static int
cmp_int (const void *a, const void *b)
{
int ia = *((const int*)a);
int ib = *((const int*)b);
if ( ia == ib )
return 0;
else if ( ia > ib )
return 1;
else
return -1;
}
/**
* The difference between the fourth and first quintile values,
* the "inter-quintile range"
*/
static int
range_quintile(int *vals, int nvals)
{
qsort(vals, nvals, sizeof(int), cmp_int);
return vals[4*nvals/5] - vals[nvals/5];
}
/**
* Given double array, return sum of values.
*/
static double
total_double(const double *vals, int nvals)
{
int i;
float total = 0;
/* Calculate total */
for ( i = 0; i < nvals; i++ )
total += vals[i];
return total;
}
#if POSTGIS_DEBUG_LEVEL >= 3
/**
* Given int array, return sum of values.
*/
static int
total_int(const int *vals, int nvals)
{
int i;
int total = 0;
/* Calculate total */
for ( i = 0; i < nvals; i++ )
total += vals[i];
return total;
}
/**
* The average of an array of integers.
*/
static double
avg(const int *vals, int nvals)
{
int t = total_int(vals, nvals);
return (double)t / (double)nvals;
}
/**
* The standard deviation of an array of integers.
*/
static double
stddev(const int *vals, int nvals)
{
int i;
double sigma2 = 0;
double mean = avg(vals, nvals);
/* Calculate sigma2 */
for ( i = 0; i < nvals; i++ )
{
double v = (double)(vals[i]);
sigma2 += (mean - v) * (mean - v);
}
return sqrt(sigma2 / nvals);
}
#endif /* POSTGIS_DEBUG_LEVEL >= 3 */
/**
* Given a position in the n-d histogram (i,j,k) return the
* position in the 1-d values array.
*/
static int
nd_stats_value_index(const ND_STATS *stats, int *indexes)
{
int d;
int accum = 1, vdx = 0;
/* Calculate the index into the 1-d values array that the (i,j,k,l) */
/* n-d histogram coordinate implies. */
/* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
for ( d = 0; d < (int)(stats->ndims); d++ )
{
int size = (int)(stats->size[d]);
if ( indexes[d] < 0 || indexes[d] >= size )
{
POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
return -1;
}
vdx += indexes[d] * accum;
accum *= size;
}
return vdx;
}
/**
* Convert an #ND_BOX to a JSON string for printing
*/
static char*
nd_box_to_json(const ND_BOX *nd_box, int ndims)
{
char *rv;
int i;
stringbuffer_t *sb = stringbuffer_create();
stringbuffer_append(sb, "{\"min\":[");
for ( i = 0; i < ndims; i++ )
{
if ( i ) stringbuffer_append(sb, ",");
stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
}
stringbuffer_append(sb, "],\"max\":[");
for ( i = 0; i < ndims; i++ )
{
if ( i ) stringbuffer_append(sb, ",");
stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
}
stringbuffer_append(sb, "]}");
rv = stringbuffer_getstringcopy(sb);
stringbuffer_destroy(sb);
return rv;
}
/**
* Convert an #ND_STATS to a JSON representation for
* external use.
*/
static char*
nd_stats_to_json(const ND_STATS *nd_stats)
{
char *json_extent, *str;
int d;
stringbuffer_t *sb = stringbuffer_create();
int ndims = (int)roundf(nd_stats->ndims);
stringbuffer_append(sb, "{");
stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
/* Size */
stringbuffer_append(sb, "\"size\":[");
for ( d = 0; d < ndims; d++ )
{
if ( d ) stringbuffer_append(sb, ",");
stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
}
stringbuffer_append(sb, "],");
/* Extent */
json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
pfree(json_extent);
stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
stringbuffer_append(sb, "}");
str = stringbuffer_getstringcopy(sb);
stringbuffer_destroy(sb);
return str;
}
/**
* Create a printable view of the #ND_STATS histogram.
* Caller is responsible for freeing.
* Currently only prints first two dimensions.
*/
// static char*
// nd_stats_to_grid(const ND_STATS *stats)
// {
// char *rv;
// int j, k;
// int sizex = (int)roundf(stats->size[0]);
// int sizey = (int)roundf(stats->size[1]);
// stringbuffer_t *sb = stringbuffer_create();
//
// for ( k = 0; k < sizey; k++ )
// {
// for ( j = 0; j < sizex; j++ )
// {
// stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
// }
// stringbuffer_append(sb, "\n");
// }
//
// rv = stringbuffer_getstringcopy(sb);
// stringbuffer_destroy(sb);
// return rv;
// }
/** Expand the bounds of target to include source */
static int
nd_box_merge(const ND_BOX *source, ND_BOX *target)
{
int d;
for ( d = 0; d < ND_DIMS; d++ )
{
target->min[d] = Min(target->min[d], source->min[d]);
target->max[d] = Max(target->max[d], source->max[d]);
}
return true;
}
/** Zero out an ND_BOX */
static int
nd_box_init(ND_BOX *a)
{
memset(a, 0, sizeof(ND_BOX));
return true;
}
/**
* Prepare an ND_BOX for bounds calculation:
* set the maxes to the smallest thing possible and
* the mins to the largest.
*/
static int
nd_box_init_bounds(ND_BOX *a)
{
int d;
for ( d = 0; d < ND_DIMS; d++ )
{
a->min[d] = FLT_MAX;
a->max[d] = -1 * FLT_MAX;
}
return true;
}
/** Set the values of an #ND_BOX from a #GBOX */
static void
nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
{
int d = 0;
POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
nd_box_init(nd_box);
nd_box->min[d] = gbox->xmin;
nd_box->max[d] = gbox->xmax;
d++;
nd_box->min[d] = gbox->ymin;
nd_box->max[d] = gbox->ymax;
d++;
if ( FLAGS_GET_GEODETIC(gbox->flags) )
{
nd_box->min[d] = gbox->zmin;
nd_box->max[d] = gbox->zmax;
return;
}
if ( FLAGS_GET_Z(gbox->flags) )
{
nd_box->min[d] = gbox->zmin;
nd_box->max[d] = gbox->zmax;
d++;
}
if ( FLAGS_GET_M(gbox->flags) )
{
nd_box->min[d] = gbox->mmin;
nd_box->max[d] = gbox->mmax;
d++;
}
return;
}
/**
* Return true if #ND_BOX a overlaps b, false otherwise.
*/
static int
nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
{
int d;
for ( d = 0; d < ndims; d++ )
{
if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
return false;
}
return true;
}
/**
* Return true if #ND_BOX a contains b, false otherwise.
*/
static int
nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
{
int d;
for ( d = 0; d < ndims; d++ )
{
if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
return false;
}
return true;
}
/**
* Expand an #ND_BOX ever so slightly. Expand parameter is the proportion
* of total width to add.
*/
static int
nd_box_expand(ND_BOX *nd_box, double expansion_factor)
{
int d;
double size;
for ( d = 0; d < ND_DIMS; d++ )
{
size = nd_box->max[d] - nd_box->min[d];
if ( size <= 0 ) continue;
nd_box->min[d] -= size * expansion_factor / 2;
nd_box->max[d] += size * expansion_factor / 2;
}
return true;
}
/**
* What stats cells overlap with this ND_BOX? Put the lowest cell
* addresses in ND_IBOX->min and the highest in ND_IBOX->max
*/
static inline int
nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
{
int d;
POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
/* Initialize ibox */
memset(nd_ibox, 0, sizeof(ND_IBOX));
/* In each dimension... */
for ( d = 0; d < nd_stats->ndims; d++ )
{
double smin = nd_stats->extent.min[d];
double smax = nd_stats->extent.max[d];
double width = smax - smin;
int size = roundf(nd_stats->size[d]);
/* ... find cells the box overlaps with in this dimension */
nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
/* Push any out-of range values into range */
nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
nd_ibox->max[d] = Min(nd_ibox->max[d], size-1);
}
return true;
}
/**
* Returns the proportion of b2 that is covered by b1.
*/
static inline double
nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
{
int d;
bool covered = true;
double ivol = 1.0;
double vol2 = 1.0;
double vol1 = 1.0;
for ( d = 0 ; d < ndims; d++ )
{
if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
return 0.0; /* Disjoint */
if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
covered = false;
}
if ( covered )
return 1.0;
for ( d = 0; d < ndims; d++ )
{
double width1 = b1->max[d] - b1->min[d];
double width2 = b2->max[d] - b2->min[d];
double imin, imax, iwidth;
vol1 *= width1;
vol2 *= width2;
imin = Max(b1->min[d], b2->min[d]);
imax = Min(b1->max[d], b2->max[d]);
iwidth = imax - imin;
iwidth = Max(0.0, iwidth);
ivol *= iwidth;
}
if ( vol2 == 0.0 )
return vol2;
return ivol / vol2;
}
/**
* Calculate how much a set of boxes is homogenously distributed
* or contentrated within one dimension, returning the range_quintile of
* of the overlap counts per cell in a uniform
* partition of the extent of the dimension.
* A uniform distribution of counts will have a small range
* and will require few cells in a selectivity histogram.
* A diverse distribution of counts will have a larger range
* and require more cells in a selectivity histogram (to
* distinguish between areas of feature density and areas
* of feature sparseness. This measurement should help us
* identify cases like X/Y/Z data where there is lots of variability
* in density in X/Y (diversely in a multi-kilometer range) and far
* less in Z (in a few-hundred meter range).
*/
static int
nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
{
/* How many bins shall we use in figuring out the distribution? */
static int num_bins = 50;
int d, i, k, range;
int counts[num_bins];
double smin, smax; /* Spatial min, spatial max */
double swidth; /* Spatial width of dimension */
#if POSTGIS_DEBUG_LEVEL >= 3
double average, sdev, sdev_ratio;
#endif
int bmin, bmax; /* Bin min, bin max */
const ND_BOX *ndb;
/* For each dimension... */
for ( d = 0; d < ndims; d++ )
{
/* Initialize counts for this dimension */
memset(counts, 0, sizeof(int)*num_bins);
smin = extent->min[d];
smax = extent->max[d];
swidth = smax - smin;
/* Don't try and calculate distribution of overly narrow dimensions */
if ( swidth < MIN_DIMENSION_WIDTH )
{
distribution[d] = 0;
continue;
}
/* Sum up the overlaps of each feature with the dimensional bins */
for ( i = 0; i < num_boxes; i++ )
{
double minoffset, maxoffset;
/* Skip null entries */
ndb = nd_boxes[i];
if ( ! ndb ) continue;
/* Where does box fall relative to the working range */
minoffset = ndb->min[d] - smin;
maxoffset = ndb->max[d] - smin;
/* Skip boxes that our outside our working range */
if ( minoffset < 0 || minoffset > swidth ||
maxoffset < 0 || maxoffset > swidth )
{
continue;
}
/* What bins does this range correspond to? */
bmin = num_bins * (minoffset) / swidth;
bmax = num_bins * (maxoffset) / swidth;
POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
/* Increment the counts in all the bins this feature overlaps */
for ( k = bmin; k <= bmax; k++ )
{
counts[k] += 1;
}
}
/* How dispersed is the distribution of features across bins? */
range = range_quintile(counts, num_bins);
#if POSTGIS_DEBUG_LEVEL >= 3
average = avg(counts, num_bins);
sdev = stddev(counts, num_bins);
sdev_ratio = sdev/average;
POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
#endif
distribution[d] = range;
}
return true;
}
/**
* Given an n-d index array (counter), and a domain to increment it
* in (ibox) increment it by one, unless it's already at the max of
* the domain, in which case return false.
*/
static inline int
nd_increment(ND_IBOX *ibox, int ndims, int *counter)
{
int d = 0;
while ( d < ndims )
{
if ( counter[d] < ibox->max[d] )
{
counter[d] += 1;
break;
}
counter[d] = ibox->min[d];
d++;
}
/* That's it, cannot increment any more! */
if ( d == ndims )
return false;
/* Increment complete! */
return true;
}
static ND_STATS*
pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
{
int stats_kind = STATISTIC_KIND_ND;
int rv;
ND_STATS *nd_stats;
/* If we're in 2D mode, set the kind appropriately */
if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D;
/* Then read the geom status histogram from that */
#if POSTGIS_PGSQL_VERSION < 100
float4 *floatptr;
int nvalues;
rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid,
NULL, NULL, NULL, &floatptr, &nvalues);
if ( ! rv ) {
POSTGIS_DEBUGF(2,
"no slot of kind %d in stats tuple", stats_kind);
return NULL;
}
/* Clone the stats here so we can release the attstatsslot immediately */
nd_stats = palloc(sizeof(float) * nvalues);
memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
/* Clean up */
free_attstatsslot(0, NULL, 0, floatptr, nvalues);
#else /* PostgreSQL 10 or higher */
AttStatsSlot sslot;
rv = get_attstatsslot(&sslot, stats_tuple, stats_kind, InvalidOid,
ATTSTATSSLOT_NUMBERS);
if ( ! rv ) {
POSTGIS_DEBUGF(2,
"no slot of kind %d in stats tuple", stats_kind);
return NULL;
}
/* Clone the stats here so we can release the attstatsslot immediately */
nd_stats = palloc(sizeof(float4) * sslot.nnumbers);
memcpy(nd_stats, sslot.numbers, sizeof(float4) * sslot.nnumbers);
free_attstatsslot(&sslot);
#endif
return nd_stats;
}
/**
* Pull the stats object from the PgSQL system catalogs. Used
* by the selectivity functions and the debugging functions.
*/
static ND_STATS*
pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
{
HeapTuple stats_tuple = NULL;
ND_STATS *nd_stats;
/* First pull the stats tuple for the whole tree */
if ( ! only_parent )
{
POSTGIS_DEBUGF(2, "searching whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
stats_tuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(table_oid), Int16GetDatum(att_num), BoolGetDatum(true));
if ( stats_tuple )
POSTGIS_DEBUGF(2, "found whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
}
/* Fall-back to main table stats only, if not found for whole tree or explicitly ignored */
if ( only_parent || ! stats_tuple )
{
POSTGIS_DEBUGF(2, "searching parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
stats_tuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(table_oid), Int16GetDatum(att_num), BoolGetDatum(false));
if ( stats_tuple )
POSTGIS_DEBUGF(2, "found parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
}
if ( ! stats_tuple )
{
POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
return NULL;
}
nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode);
ReleaseSysCache(stats_tuple);
if ( ! nd_stats )
{
POSTGIS_DEBUGF(2,
"histogram for attribute %d of table \"%s\" does not exist?",
att_num, get_rel_name(table_oid));
}
return nd_stats;
}
/**
* Pull the stats object from the PgSQL system catalogs. The
* debugging functions are taking human input (table names)
* and columns, so we have to look those up first.
* In case of parent tables whith INHERITS, when "only_parent"
* is true this function only searchs for stats in the parent
* table ignoring any statistic collected from the children.
*/
static ND_STATS*
pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
{
const char *att_name = text2cstring(att_text);
AttrNumber att_num;
/* We know the name? Look up the num */
if ( att_text )
{
/* Get the attribute number */
att_num = get_attnum(table_oid, att_name);
if ( ! att_num ) {
elog(ERROR, "attribute \"%s\" does not exist", att_name);
return NULL;
}
}
else
{
elog(ERROR, "attribute name is null");
return NULL;
}
return pg_get_nd_stats(table_oid, att_num, mode, only_parent);
}