In [69]:
from flask import current_app as app
from opensearchpy.client import OpenSearch
from opensearchpy.helpers.search import Search
from pprint import pprint

# Connect to the OpenSearch node on localhost:9200.
# NOTE(review): host and credentials are hardcoded; presumably these are the
# local development defaults — move them to environment variables before
# sharing this notebook.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    scheme="http",
)

# Raw record-view events for a single record, limited to events whose
# ``country`` field is "imported".
search = (
    Search(using=client, index="events-stats-record-view")
    # Public replacement for writing to the private ``_params`` dict; the
    # stored value is passed along as a request parameter on scan()/execute().
    .params(size=10000)
    .filter("term", country="imported")
    .filter("term", recid="hse7q-t5221")
)
# search = search.filter("term", unique_session_id="0a0c3bb63269a28073c7d5e9265040f906121264526599c4a1f60e6b")
# search = search.filter("range", timestamp={"gte": "2024-01-01T00:00:00", "lte": "2025-01-01T00:00:00"})

# One bucket per ``unique_id`` value, holding all events that share it.
# Each bucket has
# - "doc_count" with the total number of events
# - "unique_count" with the number of unique unique_session_id values
# - "top_hit" with up to 10 events, newest timestamp first
terms = search.aggs.bucket("terms", "terms", field="unique_id")
terms.metric("top_hit", "top_hits", size=10, sort={"timestamp": "desc"})
terms.metric(
    "unique_count",
    "cardinality",
    field="unique_session_id",
    precision_threshold=2000,
)

# Pass 1: pull every matching event and verify that no unique_session_id
# occurs more than once. Each hit is converted to a dict exactly once.
scanned_hits = [hit.to_dict() for hit in search.scan()]
print(len(scanned_hits))
if scanned_hits:
    pprint(scanned_hits[0])

ids = []  # distinct unique_session_id values in first-seen order
seen_session_ids = set()  # mirrors ``ids``; constant-time membership checks
for event in scanned_hits:
    # assumes unique_session_id is a string (hashable), as in the sample output
    session_id = event["unique_session_id"]
    if session_id in seen_session_ids:
        print("DUPLICATE unique_session_id")
        print(session_id)
        print(event)
    else:
        seen_session_ids.add(session_id)
        ids.append(session_id)

# Pass 2: run the same query through the regular search API to obtain the
# aggregation buckets, then print only the summary fields of each bucket.
response = search.execute()
print(response.hits.total.value)
summary_keys = ("key", "doc_count", "unique_count")
buckets = response.to_dict()["aggregations"]["terms"]["buckets"]
pprint(
    [
        {name: value for name, value in bucket.items() if name in summary_keys}
        for bucket in buckets
    ]
)

108
{'country': 'imported',
 'is_robot': False,
 'parent_recid': 'hwq2e-mwz96',
 'recid': 'hse7q-t5221',
 'timestamp': '2024-09-06T09:19:42',
 'unique_id': 'ui_hse7q-t5221',
 'unique_session_id': '4c36cfac62f8637aa556841d2f3d0276f1aa121eeb77751fce34168d',
 'updated_timestamp': '2024-09-18T22:52:24.638910',
 'via_api': False,
 'visitor_id': 'a93f5ad8f20015e667df5e3339dfd5477b70b164746135c054d516f7'}
108
[{'doc_count': 108, 'key': 'ui_hse7q-t5221', 'unique_count': {'value': 108}}]


In [70]:
from flask import current_app as app
from opensearchpy.client import OpenSearch
from opensearchpy.helpers.search import Search
from pprint import pprint

# Connect to the OpenSearch node on localhost:9200.
# NOTE(review): host and credentials are hardcoded; presumably these are the
# local development defaults — move them to environment variables before
# sharing this notebook.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    scheme="http",
)

# Aggregated view statistics for a single record.
search = Search(using=client, index="stats-record-view").filter(
    "term", recid="hse7q-t5221"
)
print(search.count())
# search = search.filter("term", timestamp="2024-01-01T00:00:00")

# The size parameter is added only after count() has run, so it applies to
# the execute() call below and not to the count request. ``params()`` is the
# public replacement for writing to the private ``_params`` dict.
search = search.params(size=10000)
response = search.execute()
print(response.hits.total.value)

# Convert the response once and reuse the hit list for every summary below.
hits = response.to_dict()["hits"]["hits"]
print(sum(hit["_source"]["count"] for hit in hits))
print(sum(hit["_source"]["unique_count"] for hit in hits))
# Hits whose unique_count differs from their count.
print([hit for hit in hits if hit["_source"]["unique_count"] != hit["_source"]["count"]])
pprint(response.to_dict())


0
0
0
0
[]
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 45, 'total': 45},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 2}


In [60]:
from flask import current_app as app
from opensearchpy.client import OpenSearch
from opensearchpy.helpers.search import Search
from pprint import pprint

# Connect to the OpenSearch node on localhost:9200.
# NOTE(review): host and credentials are hardcoded; presumably these are the
# local development defaults — confirm before sharing this notebook.
client = OpenSearch(
    scheme="http",
    http_auth=("admin", "admin"),
    hosts=[{"host": "localhost", "port": 9200}],
)

# Timestamp bounds for the range filter (both ends inclusive: gte / lte).
timestamp_window = {"gte": "2024-06-01T00:00:00", "lte": "2025-01-01T00:00:00"}

# Query the aggregated stats index for one record inside the window, with
# ``_source`` set to False in the request body.
aggs_query = (
    Search(using=client, index="stats-record-view")
    .extra(_source=False)
    .filter("term", recid="b6td4-wpw6")
    .filter("range", timestamp=timestamp_window)
)

result = aggs_query.execute()
pprint(result.to_dict())

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 69, 'total': 69},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 8}
