-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
test_aggregate.py
214 lines (196 loc) · 11.8 KB
/
test_aggregate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# Copyright 2021-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#############################################################################
# Tests aggregation functions COUNT(), MIN(), MAX(), SUM()
#############################################################################
import pytest
import math
from decimal import Decimal
from util import new_test_table, unique_key_int, project, new_type
from cassandra.util import Date
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, v int, d double, dc decimal, PRIMARY KEY (p, c)") as table:
yield table
# When there is no row matching the selection, the count should be 0.
# First check a "=" expression matching no row:
def test_count_empty_eq(cql, table1):
p = unique_key_int()
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
# The aggregation "0" for no results is true for count(), but not all
# aggregators - min() returns null for no results, while sum() and
# avg() return 0 for no results (see discussion in issue #13027):
assert [(None,)] == list(cql.execute(f"select min(v) from {table1} where p = {p}"))
assert [(0,)] == list(cql.execute(f"select sum(v) from {table1} where p = {p}"))
assert [(0,)] == list(cql.execute(f"select avg(v) from {table1} where p = {p}"))
# Above in test_count_empty_eq() we tested that aggregates return some value -
# sometimes 0 and sometimes null - when aggregating no data. If we select
# not just the aggregate but also something else like p, the result is even
# more bizarre: we get a null for p. One might argue that if there are no
# rows we should have just got back nothing in return - just like Cassandra
# does for GROUP BY (see below tests for #12477), but Cassandra doesn't do
# this, and this test accepts Cassandra's (and Scylla's) behavior here as
# being correct.
# See issue #13027.
def test_count_and_p_empty_eq(cql, table1):
p = unique_key_int()
assert [(None,0)] == list(cql.execute(f"select p, count(*) from {table1} where p = {p}"))
assert [(None,None)] == list(cql.execute(f"select p, min(v) from {table1} where p = {p}"))
assert [(None,0)] == list(cql.execute(f"select p, sum(v) from {table1} where p = {p}"))
# The query "p = null" also matches no row so should have a count 0, but
# it's a special case where no partition belongs to the query range so the
# aggregator doesn't need to send the query to any node. This reproduces
# issue #12475.
# This test is scylla_only because Cassandra doesn't support "p = null".
# it does support "p IN ()" with the same effect, though, so that will
# be the next test.
def test_count_empty_eq_null(cql, table1, scylla_only):
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = null"))
# A more complex list of aggregators, some return zero and some null
# for an empty result set:
assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p = null"))
# Another special case of a query which matches no partition - an IN with
# an empty list. Reproduces #12475.
def test_count_empty_in(cql, table1):
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p in ()"))
assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p in ()"))
# Simple test of counting the number of rows in a single partition
def test_count_in_partition(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
assert [(3,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
# Using count(v) instead of count(*) allows counting only rows with a non-NULL
# value in v
def test_count_specific_column(cql, test_keyspace, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
cql.execute(stmt, [p, 4, None])
assert [(4,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
assert [(3,)] == list(cql.execute(f"select count(v) from {table1} where p = {p}"))
# Check with non-scalar types too, reproduces #14198
with new_type(cql, test_keyspace, '(i int, t text)') as udt:
with new_test_table(cql, test_keyspace, f'p int, c int, fli frozen<list<int>>, tup tuple<int, bigint>, udt {udt}, PRIMARY KEY (p, c)') as table2:
cql.execute(f'INSERT INTO {table2}(p, c, fli, tup, udt) VALUES({p}, 5, [1, 2], (3, 4), {{i: 3, t: \'text\'}})')
cql.execute(f'INSERT INTO {table2}(p, c) VALUES({p}, 6)')
assert [(1,)] == list(cql.execute(f"select count(fli) from {table2} where p = {p}"))
assert [(1,)] == list(cql.execute(f"select count(tup) from {table2} where p = {p}"))
assert [(1,)] == list(cql.execute(f"select count(udt) from {table2} where p = {p}"))
assert [(2,)] == list(cql.execute(f"select count(*) from {table2} where p = {p}"))
# COUNT can be combined with GROUP BY to count separately for each partition
# or row.
def test_count_and_group_by_row(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
cql.execute(stmt, [p, 4, None])
assert [(p, 1, 1), (p, 2, 1), (p, 3, 1), (p, 4, 0)] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))
def test_count_and_group_by_partition(cql, table1):
p1 = unique_key_int()
p2 = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p1, 1, 1])
cql.execute(stmt, [p1, 2, 2])
cql.execute(stmt, [p2, 3, 3])
cql.execute(stmt, [p2, 4, None])
assert [(p1, 2), (p2, 1)] == list(cql.execute(f"select p, count(v) from {table1} where p in ({p1},{p2}) group by p"))
# In the above tests we looked for per-row or per-partition counts and got
# back more than one count. But if our query matches no row, we should get
# back no count.
@pytest.mark.xfail(reason="issue #12477")
def test_count_and_group_by_row_none(cql, table1):
p = unique_key_int()
assert [] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))
@pytest.mark.xfail(reason="issue #12477")
def test_count_and_group_by_partition_none(cql, table1):
p = unique_key_int()
assert [] == list(cql.execute(f"select p, count(v) from {table1} where p = {p} group by p"))
# Regression-test for #7729.
def test_timeuuid(cql, test_keyspace):
schema = "a int, b timeuuid, primary key (a,b)"
with new_test_table(cql, test_keyspace, schema) as table:
cql.execute(f'insert into {table} (a, b) values (0, 13814000-1dd2-11ff-8080-808080808080)')
cql.execute(f'insert into {table} (a, b) values (0, 6b1b3620-33fd-11eb-8080-808080808080)')
assert project('system_todate_system_min_b',
cql.execute(f'select todate(min(b)) from {table} where a = 0')) == [Date('2020-12-01')]
assert project('system_todate_system_max_b',
cql.execute(f'select todate(max(b)) from {table} where a = 0')) == [Date('2038-09-06')]
# Check floating-point aggregations with non-finite results - inf and nan.
# Reproduces issue #13551: sum of +inf and -inf produced an error instead
# of a NaN as in Cassandra (and in older Scylla).
def test_aggregation_inf_nan(cql, table1):
p = unique_key_int()
# Add a single infinity value, see we can read it back as infinity,
# and its sum() is also infinity of course.
cql.execute(f"insert into {table1} (p, c, d) values ({p}, 1, infinity)")
assert [(math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 1"))
assert [(math.inf,)] == list(cql.execute(f"select sum(d) from {table1} where p = {p}"))
# Add a second value, a negative infinity. See we can read it back, and
# now the sum of +Inf and -Inf should be a NaN.
# Note that we have to use isnan() to check that the result is a NaN,
# because equality check doesn't work on NaN:
cql.execute(f"insert into {table1} (p, c, d) values ({p}, 2, -infinity)")
assert [(-math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 2"))
assert math.isnan(list(cql.execute(f"select sum(d) from {table1} where p = {p}"))[0][0])
# When averaging "decimal" (arbitrary-precision floating point) values,
# which precision should the output use? The average of 0, 0, 1 is
# 0.3333333333..., but with how many threes?
# The correct answer isn't clear. As explained in #13601 and CASSANDRA-18470,
# both Scylla and Cassandra took the number of significant digits in the
# average from the number of significant digits in the sum. So for example,
# the average of 1 and 2 is 2 (in Scylla, or 1 in Cassandra), but the average
# of 1.0 and 2.0 is 1.5. Similarly the average of 1.1 and 1.2 is 1.2 (or 1.1),
# not 1.15. This is surprising.
#
# Given that there is no way for the user to configure the desired precision,
# I don't know what would be a "correct" solution. I just feel that the
# scenarios tested in this test - averaging 1 and 2 or 1.1 and 1.2 -
# don't do something that any reasonable user might expect, and will need
# to be fixed.
@pytest.mark.xfail(reason="issue #13601")
def test_avg_decimal(cql, table1, cassandra_bug):
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
# Reproduces #13601: the average of 1 and 2 was rounded up to the
# integer 2 instead of returning 1.5:
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
# Similarly, average of 1.1 and 1.2 was rounded up to 1.2 instead of
# returning 1.15.
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 1.2)")
assert [(Decimal('1.15'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
# Another test for average of "decimal" values: the average of 1, 2, 2, 3.
# In this case the average is an integer, 2, but Cassandra doesn't even
# get this right (see CASSANDRA-18470) because instead of of keeping the
# integer sum and count separately, it tries to update the average and does
# it in a too-low precision. Scylla passes this test correctly.
def test_avg_decimal_2(cql, table1, cassandra_bug):
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 3, 2)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 4, 3)")
# Reproduces CASSANDRA-18470:
assert [(Decimal('2'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))