In [21]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

import yaml
from yaml import CLoader, CDumper

import json

# Load the connection secrets (read below as secrets['elasticsearch']) from a local YAML file.
# NOTE(review): CLoader is the C build of the non-safe `Loader`, which can construct
# arbitrary Python objects from tagged YAML. Acceptable only while secrets.yaml is a
# trusted local file; consider yaml.safe_load / CSafeLoader.
with open('./secrets.yaml', 'r') as f:
    secrets = yaml.load(f, Loader=CLoader)

In [22]:

# Print dict() Tree-Structure
def treeShow(data, ctx = True, indent = ''):
    """Pretty-print a nested list/dict structure as a box-drawing tree on stdout.

    Parameters:
        data:   Value to render. Lists and dicts are expanded recursively;
                any other value (and any empty list/dict) is printed on a
                single line, strings being wrapped in double quotes.
        ctx:    True when `data` is the root or an element of a list; False
                when it is the value of a dict entry (its key has just been
                printed on the current line, so children start on a new line).
        indent: Prefix printed in front of every continuation line.

    Ordering:
        Dict entries are sorted by key, list elements keep their position.
        A dict may carry the meta entry
            "__ord__": {"path": list[key, ...], "flag": bool}
        in which case entries are sorted by the value found by following
        `path` inside each entry's value, and reversed when `flag` is true.
        The meta entry is never printed and the caller's dict is NOT modified.

    Returns:
        None. The tree is written with print().
    """
    is_list = isinstance(data, list)
    is_dict = isinstance(data, dict)

    # Leaves and empty containers fit on one line.
    if not (is_list or is_dict) or len(data) == 0:
        print(f'"{data}"' if isinstance(data, str) else data)
        return

    key_path = None
    descending = False
    if is_dict and "__ord__" in data:
        order_spec = data["__ord__"]
        if order_spec is not None:
            key_path = order_spec["path"]
            descending = order_spec["flag"]
            assert isinstance(key_path, list)

        # Work on a filtered copy instead of pop()-ing the meta entry out of
        # the caller's dict, so rendering has no side effect on the input.
        data = {k: v for k, v in data.items() if k != "__ord__"}
        if not data:
            # Only the meta entry was present: render as an empty dict rather
            # than leaving the current output line unterminated.
            print(data)
            return

    items = list(data.items()) if is_dict else list(enumerate(data))

    def resolve(value, path):
        # Follow `path` (a list of keys/indices) down into `value`.
        for step in path:
            value = value[step]
        return value

    # Order the items before printing:
    if key_path is None:
        items.sort(key=lambda entry: entry[0])
    else:
        items.sort(key=lambda entry: resolve(entry[1], key_path))
    if descending:
        items.reverse()

    last = len(items) - 1
    child_ctx = is_list

    # A lone entry in list/root context is drawn inline with a flat connector.
    if len(items) == 1 and ctx:
        key, value = items[0]
        print('─── ' + (f"{key} : " if is_dict else ""), end='')
        treeShow(value, child_ctx, indent + '    ')
        return

    for i, (key, value) in enumerate(items):
        if ctx:
            # First child continues the current line; later ones are indented.
            prefix = "" if i == 0 else indent
            if i == 0:
                branch = '┌── '
            elif i == last:
                branch = '└── '
            else:
                branch = '├── '
        else:
            # Children of a dict value start on a fresh line below the key.
            prefix = ('\n' if i == 0 else '') + indent
            branch = '└── ' if i == last else '├── '

        label = ""
        if is_dict:
            label = ('"' + key + '"' if isinstance(key, str) else f"{key}") + ": "

        print(prefix + branch + label, end='')

        # Keep the vertical rail only while more siblings follow.
        treeShow(value, child_ctx, indent + ('    ' if i == last else '│   '))

    return

In [23]:
# Open an authenticated, certificate-verified connection to the ES node.
# Hosts and credentials come from the 'elasticsearch' section of the secrets file.
esParams = secrets['elasticsearch']
client = Elasticsearch(
    hosts=esParams['host_addresses'],
    basic_auth=(
        esParams['user'],
        esParams['password'],
    ),
    ca_certs='./elasticsearch-8.14.3/config/certs/http_ca.crt',
    verify_certs=True,
    # api_key=esParams['api_key'],
)

In [24]:
# Schema/Format: explicit field mapping of the index, built from (field, type) pairs.
fmt = {
    "properties": {
        field: {"type": es_type}
        for field, es_type in (
            # "keyword" rather than "text": client.sql.query() rejects grouping on
            # a [text] field ("No keyword/multi-field defined exact matches for
            # [reviewerID]").
            ("reviewerID", "keyword"),
            ("asin", "keyword"),
            ("reviewerName", "text"),
            ("reviewText", "text"),
            ("overall", "double"),
            ("summary", "text"),
            ("unixReviewTime", "date"),
            ("reviewTime", "date"),
        )
    }
}

In [25]:
# Index-level settings sent under {"settings": {"index": ...}} when the index is created.
index_config = {
  # Static index configurations:
    "number_of_shards": 1, # Default 1
    # "number_of_routing_shards": ,
    "codec": "default",
    # "routing_partition_size": ,

    # Both soft-delete options must live under ONE "soft_deletes" key: a Python
    # dict literal keeps only the last occurrence of a repeated key, so listing
    # them as two separate entries silently dropped {"enabled": True}.
    "soft_deletes": {
        "enabled": True, # Default: true
        "retention_lease": {"period": "12h"}, # Default: 12h
    },
    "load_fixed_bitset_filters_eagerly": True, # Default: True
    # "shard": {"check_on_startup": ""}, # Either "true","false" or "checksum"
    # ...

  # Dynamic index configurations:
    # ...
}
indexName = "product_reviews"

# Delete Index if exists:
# Best-effort: any exception raised by the delete call (or while printing its
# response) is reported and then ignored, so the create call below still runs.
# NOTE(review): presumably the expected failure is "index does not exist" on a
# first run — confirm; other failures (auth, connectivity) are swallowed as well.
try:
  treeShow(
    dict(client.indices.delete(index=indexName))
  )
except Exception as e:
  print("Index deletion failed:",e)

# Create Index:
# The explicit field mapping (fmt) and the index settings (index_config) are
# sent together in the request body.
resp = client.indices.create(
  index=indexName,
  body={"mappings":fmt, "settings":{"index":index_config}}
)

treeShow(dict(resp))

─── acknowledged : True
┌── "acknowledged": True
├── "index": "product_reviews"
└── "shards_acknowledged": True


In [26]:
# Create an ingest pipeline to validate the data types:
def _stash_and_drop(field):
    """Build the on_failure handlers shared by every processor below.

    First copy the offending raw value to err.<field>, then remove <field>
    from the document.
    NOTE: Setting a field to null/None is not explicitly supported by the
    "set" processor, hence the "remove".
    """
    return [
        {"set": {"field": "err." + field, "value": "{{{" + field + "}}}"}},
        {"remove": {"field": field}},
    ]


pipeline = [
    # "overall" must be convertible to a double.
    {
        "convert": {
            "field": "overall",
            "type": "double",
            "ignore_missing": False,
            "on_failure": _stash_and_drop("overall"),
        }
    },
    # "reviewTime" is parsed from month/day/year text and rewritten in place as yyyy-MM-dd.
    {
        "date": {
            "field": "reviewTime",
            "target_field": "reviewTime",
            "formats": ["MM dd[,] yyyy", "MM d[,] yyyy", "M d[,] yyyy", "M dd[,] yyyy"],
            "timezone": "UTC",
            "output_format": "yyyy-MM-dd",
            "on_failure": _stash_and_drop("reviewTime"),
        }
    },
    # "unixReviewTime" is parsed as an epoch value and rewritten in place as epoch_millis.
    # NOTE(review): "UNIX_MS" reads the value as epoch MILLISECONDS; if the source
    # data stores epoch seconds, "UNIX" is the matching format — confirm against the data.
    {
        "date": {
            "field": "unixReviewTime",
            "target_field": "unixReviewTime",
            "formats": ["UNIX_MS"],
            "timezone": "UTC",
            "output_format": "epoch_millis",
            "on_failure": _stash_and_drop("unixReviewTime"),
        }
    },
]

# Extend the index mapping with a dynamic "err" object; the ingest pipeline's
# on_failure handlers write rejected raw values to err.<field>.
# https://www.elastic.co/guide/en/elasticsearch/reference/8.13/indices-put-mapping.html
#
# SQL queries throw errors for dynamic fields if they do not exist.
# Q. Why not assume fields that do not exist are fields with value NULL ?!
# Possible workaround (not applied): declare the sub-fields up front with
#   "properties": {field:{'type':'text'} for field in fmt['properties']}
resp = client.indices.put_mapping(
    index=indexName,
    body={
        "properties": {
            "err": {"type": "object", "dynamic": "true"},
        },
    },
)

treeShow(dict(resp))

─── acknowledged : True


In [27]:
# Register the processors above as a named ingest pipeline.
pipelineName = "typeCheck_" + indexName

# The request body may equivalently carry the keyword arguments of the call.
# Q. How are conflicting parameters/arguments given priority?
# A. Raises ValueError, on conflict
resp = client.ingest.put_pipeline(
    id=pipelineName,
    body=dict(
        description="Validate Data Types of Raw Product Reviews Index data",
        processors=pipeline,
    ),
)

treeShow(dict(resp))

─── acknowledged : True


In [28]:
# Make the type-checking pipeline the index's default, so documents indexed
# without an explicit pipeline are routed through it.
resp = client.indices.put_settings(
    index=indexName,
    body=dict(index=dict(default_pipeline=pipelineName)),
)

treeShow(dict(resp))

─── acknowledged : True


In [29]:
# Load JSON data to local memory incrementally as a generator:
path = "q2_data.json"

# Bulk API calls are of the form: {op_type: { ... }}
# I assume the documents are parsed into above format by:
  # elasticsearch.helpers.actions.expand_action( ... )
# However, I was unable to find parsing of configurations
# such as "list_executed_pipelines", "dynamic_templates", "timeout" and so on.
def load_data(source_path=None, index_name=None):
    """Yield one bulk "index" action per non-blank line of a JSON-lines file.

    Parameters:
        source_path: File to read, one JSON document per line. Defaults to
                     the notebook-level `path`.
        index_name:  Target index of every action. Defaults to the
                     notebook-level `indexName`.

    Yields:
        dict with the keys "_op_type" ("index"), "_index", "_id" and
        "_source". "_id" is the file path with every '.' replaced by '_',
        followed by '_<n>' where n is the 0-based position of the document
        among the non-blank lines.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.

    The file is opened inside a `with` block, so its handle is released as
    soon as the generator is exhausted or closed (the previous version never
    closed it). It is read as UTF-8 regardless of the platform default.
    """
    if source_path is None:
        source_path = path
    if index_name is None:
        index_name = indexName

    id_prefix = '_'.join(source_path.split('.'))

    with open(source_path, 'r', encoding='utf-8') as lines:
        # Skip blank / whitespace-only lines, parse the rest lazily.
        records = (
            json.loads(text)
            for text in (line.strip() for line in lines)
            if text != ''
        )

        for i, record in enumerate(records):
            yield {
                "_op_type": "index",

                # Pipeline for ingestion may be specified once
                # for all requests instead of specifying document-wise
                # "pipeline": "_none", # "_none" explicitly nullifies any default pipeline

                # Index to which document is stored:
                "_index": index_name,

                # "timeout" : "1m",
                "_id": f"{id_prefix}_{i}",
                "_source": record,
            }

# Stream the actions to Elasticsearch and collect every per-document result.
# Each collected entry is an (ok, response) pair.
res = list(
    parallel_bulk(
        client=client,
        actions=load_data(),
        thread_count=8,
        queue_size=8,
        # Argument for underlying 'elasticsearch.helpers.actions._process_bulk_chunk( ... )'
        raise_on_error=False,
    )
)
print("Total document processed while ingesting/storing:", len(res))
print("Total errors encountered:", sum(1 for ok, _ in res if not ok))

Total document processed while ingesting/storing: 34625
Total errors encountered: 0


In [30]:
# Iterate over the results and log the errors:
for flag, msg in res:
    if not flag:
        # treeShow() writes the tree itself and returns None, so it must not be
        # wrapped in print() (that appended a stray "None" after every tree).
        treeShow(msg)

In [31]:
# Sample index response:
# Each entry of `res` is an (ok, response) pair; show the response part of the first one.
treeShow(res[0][1])

─── index : 
    ├── "_id": "q2_data_json_0"
    ├── "_index": "product_reviews"
    ├── "_primary_term": 1
    ├── "_seq_no": 0
    ├── "_shards": 
    │   ├── "failed": 0
    │   ├── "successful": 1
    │   └── "total": 2
    ├── "_version": 1
    ├── "result": "created"
    └── "status": 201


In [53]:
# NOTE: For all the Python client API methods,
# the arguments may be equivalently passed directly as
# JSON through the universal 'body' argument for all(almost?) methods

# Query the documents: (Using SQL)
# NOTE: https://www.elastic.co/guide/en/elasticsearch/reference/master/sql-limitations.html
def exec_SQLQuery(query, cluster):
    """Run an SQL query through the Elasticsearch SQL search API.

    Parameters:
        query:   SQL query string that is translated and executed on Elasticsearch.
        cluster: Passed as `catalog` of the request.

    Returns:
        The response of client.sql.query(), requested in "json" format.

    Reference:
        https://www.elastic.co/guide/en/elasticsearch/reference/master/sql-search-api.html
    """
    return client.sql.query(
        # SQL query string that is translated and executed on Elasticsearch.
        query=query,
        catalog=cluster,

        # https://www.elastic.co/guide/en/elasticsearch/reference/8.14/sql-rest-format.html
        format="json",

        # The maximum number of rows (or entries) to return in one response.
        fetch_size=10**4,
        # Throw an exception if a field corresponds to multiple values.
        field_multi_value_leniency=False,
        # Default "5d", i.e. 5 days.
        keep_alive="6h",

        # Left at None explicitly:
        error_trace=None,
        keep_on_completion=None,
        request_timeout=None,
        wait_for_completion_timeout=None,
        page_timeout=None,
        # Painless scripts may define fields computed at query runtime,
        # derived from the entire context of the document.
        runtime_mappings=None,

        # Options deliberately not passed:
        #   cursor=None
        #       (Optional, string) Cursor used to retrieve a set of paginated
        #       results. If a cursor is specified, the API only uses the
        #       'columnar' and 'time_zone' request body parameters and ignores
        #       the others.
        #   time_zone='Z'
        #       Time zone ID for the query. Default: 'Z' (UTC).
        #   columnar=True
        #       A more space-efficient and compressible representation; available
        #       for the 'json', 'yaml', 'cbor' and 'smile' formats.
        #       https://www.elastic.co/guide/en/elasticsearch/reference/8.14/sql-rest-columnar.html
        #   filter=None
        #       ES Query DSL for additional filtering BEFORE execution of the
        #       SQL query; may be used to filter documents on meta-data fields.
        #   params=[]
        #       Values substituted, in order, for the '?' markers of the query.
        #       https://www.elastic.co/guide/en/elasticsearch/reference/master/sql-rest-params.html
        #   index_using_frozen=False
        #       Deprecated: enable executing queries on "frozen" indices.
        #   filter_path=None, human=None, pretty=None
        #       Purpose not investigated.
        #   body=None
        #       The request body, which may equivalently specify the arguments
        #       above; conflicting values raise ValueError.
    )

# Get cluster meta-data:
# The 'cluster_name' entry of client.info() is what the query cells below pass
# as the `cluster` argument of exec_SQLQuery.
clusterMetaData = client.info()
clusterName = clusterMetaData['cluster_name']
print("Cluster Name:",clusterName)

Cluster Name: sounak


In [54]:
# Check for invalid fields in documents after data type matching:

# from elasticsearch.exceptions import BadRequestError

# Failed attempt:

# # BadRequestError(
# #   400, 
# #   'verification_exception', 
# #   'Found 1 problem\nline 3:5: [count(err.reviewTime)] cannot operate on field \
# #     of data type [text]: No keyword/multi-field defined exact matches for [reviewTime]; \
# #     define one or use MATCH/QUERY instead'
# # )
# field = "reviewTime"
# invalidRowsQuery = """
#   select
#     count(err.{field}) as errors
#   from
#     {indexName}
# """.format(indexName=indexName,field=field)
# treeShow(
#   dict(exec_SQLQuery(
#     invalidRowsQuery, 
#     clusterName
#   ))
# )  



In [55]:
# Question 1:
  # Total number of records

# Counts every document of the index. The commented-out line inside the SQL
# text records that a trailing ';' is rejected by the SQL parser.
q1 = """
  select
    count(*) as total_reviews
  from
    {indexName}
  -- ; BadRequestError: BadRequestError(400, 'parsing_exception', "line 6:3: extraneous input ';' expecting <EOF>")
""".format(indexName=indexName)

treeShow(
  dict(exec_SQLQuery(q1, clusterName))
)

┌── "columns": 
│   └── ┌── "name": "total_reviews"
│       └── "type": "long"
└── "rows": 
    └── ─── 34625


In [56]:
# Question 2:
  # Top ten reviewers and their count of reviews

# Groups by reviewerID and keeps the ten largest groups.
# NOTE(review): the count is over DISTINCT asin, i.e. distinct products reviewed
# per reviewer rather than number of review documents — confirm this is the
# intended reading of "count of reviews".
q2 = """
  select
    reviewerID,
    count(distinct asin) as count_of_reviews
  from
    {indexName}
  group by
    reviewerID
  order by
    count_of_reviews desc
  limit
    10
""".format(indexName=indexName)

treeShow(
  dict(exec_SQLQuery(q2, clusterName))
)

┌── "columns": 
│   ├── ┌── "name": "reviewerID"
│   │   └── "type": "keyword"
│   └── ┌── "name": "count_of_reviews"
│       └── "type": "long"
└── "rows": 
    ├── ┌── "A281NPSIMI1C2R"
    │   └── 19
    ├── ┌── "A1M04H40ZVGWVG"
    │   └── 9
    ├── ┌── "A3M174IC0VXOS2"
    │   └── 9
    ├── ┌── "A3KEZLJ59C1JVH"
    │   └── 8
    ├── ┌── "ALQGOMOY1F5X9"
    │   └── 8
    ├── ┌── "A17IS2KTMUBJ52"
    │   └── 7
    ├── ┌── "A2V5R832QCSOMX"
    │   └── 7
    ├── ┌── "A3LJLRIZL38GG3"
    │   └── 7
    ├── ┌── "A3R9H6OKZHHRJD"
    │   └── 7
    └── ┌── "AV6823XS14U41"
        └── 7


In [36]:
# Question 3: (Failed attempt — aggregation over a grouped sub-select)
# The query text is only assigned here; it is not executed in this cell.
# Error recorded for this query:
# BadRequestError:
#   BadRequestError(
#     400,
#     'verification_exception',
#     'Found 1 problem\nline 13:5: Nested aggregations in sub-selects are not supported.'
#   )
q3 = """
  select
    asin,
    count(reviewerID) as reviews,
    avg(overall) as avg_rating
  from
    (select
      asin,
      reviewerID,
      avg(overall) as overall
    from
      {indexName}
    group by
      asin, reviewerID) as T
  group by
    asin
  order by
    reviews desc, avg_rating desc
  limit
    10
""".format(indexName=indexName)

In [37]:
# Question 3: (Failed attempt — same aggregation written with a WITH clause)
# The query text is only assigned here; the call that would execute it is
# commented out at the end of the cell.
# Error recorded for this query:
# BadRequestError:
#   BadRequestError(
#     400,
#     'verification_exception',
#     'Found 1 problem\nline 18:5: Unknown index [T]'
#   )
q3 = """
  with
  T as (
    select
      asin,
      reviewerID,
      avg(overall) as overall
    from
      {indexName}
    group by
      asin, reviewerID
  )
  select
    asin,
    count(reviewerID) as reviews,
    avg(overall) as avg_rating
  from
    T
  group by
    asin
  order by
    reviews desc, avg_rating desc
  limit
    10
""".format(indexName=indexName)

# NOTE: https://www.elastic.co/guide/en/elasticsearch/reference/current/sql-limitations.html
# treeShow(
#   dict(exec_SQLQuery(q3, clusterName))
# )

In [57]:
# Question 3:
  # Top ten products which had reviews, their count of reviews and average rating

# ES|QL attempt: (Success!)
# Two chained STATS stages replace the nested aggregation that SQL rejected:
#   1. average `overall` per (asin, reviewerID) pair;
#   2. per asin, count the distinct reviewerIDs and average the per-reviewer means.
# The rows are then sorted by review count, then average rating (both descending),
# and limited to ten.
# NOTE(review): the second COUNT_DISTINCT argument (10**3) is presumably the
# precision threshold of the approximate distinct count — confirm in the ES|QL docs.
query = """
  FROM {index} METADATA _index,_id,_version
  | STATS 
      avg_rating = AVG(overall)
    BY asin, reviewerID
  | STATS
    reviews = COUNT_DISTINCT(reviewerID, {precision})
    , avg_rating = AVG(avg_rating)
    BY asin
  | SORT reviews DESC NULLS LAST, avg_rating DESC NULLS LAST
  | LIMIT 10
""".format(
  index=indexName,
  precision=10**3
)

treeShow(
  dict(client.esql.query(
    query=query
  ))
)

┌── "columns": 
│   ├── ┌── "name": "reviews"
│   │   └── "type": "long"
│   ├── ┌── "name": "avg_rating"
│   │   └── "type": "double"
│   └── ┌── "name": "asin"
│       └── "type": "keyword"
└── "values": 
    ├── ┌── 792
    │   ├── 3.005050505050505
    │   └── "B0010AZ6Q2"
    ├── ┌── 642
    │   ├── 3.5171339563862927
    │   └── "B00113FENI"
    ├── ┌── 493
    │   ├── 3.8823529411764706
    │   └── "B0000A1WGK"
    ├── ┌── 406
    │   ├── 3.6798029556650245
    │   └── "B000IZ8KZ4"
    ├── ┌── 323
    │   ├── 3.777089783281734
    │   └── "B000LCETUO"
    ├── ┌── 317
    │   ├── 3.9400630914826498
    │   └── "B0007V6PK6"
    ├── ┌── 304
    │   ├── 3.611842105263158
    │   └── "B007M81A6G"
    ├── ┌── 302
    │   ├── 3.5927152317880795
    │   └── "B003E6VTW0"
    ├── ┌── 289
    │   ├── 4.26643598615917
    │   └── "B001800I44"
    └── ┌── 257
        ├── 4.214007782101167
        └── "B004R5E12M"
