In [None]:
import dotenv
import importlib.util
import logging
import json
import os

import pathway as pw
from pathway.xpacks.llm import embedders, parsers, splitters, vector_store
from unstructured.chunking.title import chunk_by_title
# from unstructured.documents.elements import Title

In [None]:
# Make sure libmagic is available: fail fast, with an actionable message, when
# the `magic` Python module cannot be located.
LIBMAGIC_AVAILABLE = importlib.util.find_spec("magic") is not None
assert LIBMAGIC_AVAILABLE, (
    "The `magic` module was not found; install libmagic and its Python bindings "
    "(e.g. `brew install libmagic` and `pip install python-magic-bin`)."
)

In [None]:
# Logger for this notebook, named after the current module.
logger = logging.getLogger(__name__)

In [None]:
# Read settings from the `.env` file in the working directory; `config` is
# looked up by key in the parameters cell.
config = dotenv.dotenv_values('.env')

## Parameters

In [None]:
# Boardroom protocol identifier (cname) used in every endpoint URL below.
protocol_name = 'aave'
# Sort direction sent with the proposals request.
prop_order_by = 'asc'
# Prefer the value from `.env`, fall back to the process environment, and fail
# with a clear message instead of silently building URLs with `key=None`.
api_key = config.get('BOARDROOM_API_KEY') or os.environ.get('BOARDROOM_API_KEY')
if not api_key:
    raise KeyError(
        "BOARDROOM_API_KEY is not set; add it to .env or export it in the environment"
    )

## Schemas

In [None]:
# Envelope schema shared by the JSON-format Boardroom reads below: only the
# top-level `data` field of each response is kept, as untyped JSON.
class BoardroomAPI(pw.Schema):
    data: pw.Json

## Parser Config

### Custom Parser UDF

In [None]:
from collections.abc import Callable
from io import BytesIO
from pathway.optional_import import optional_imports
# from typing import TYPE_CHECKING, Any, Literal

from typing import Any


In [None]:
class CustomParseUnstructured(pw.UDF):
    """
    Parse a document with `unstructured <https://unstructured.io/>`_ and return its
    elements as plain dictionaries.

    This appears to be a trimmed-down variant of Pathway's ``ParseUnstructured``
    parser: instead of ``(text, metadata)`` pairs it returns each element's full
    ``to_dict()`` representation.

    All arguments can be overridden during UDF application.

    Args:
        - mode: ``single``, ``elements`` or ``paged`` are accepted by the constructor
          for compatibility, but only ``elements`` is implemented; applying the UDF
          with any other mode raises ``ValueError``.
        - post_processors: list of callables that will be applied to every extracted
          element.
        - **unstructured_kwargs: extra kwargs to be passed to unstructured.io's
          ``partition`` function.
    """

    def __init__(
        self,
        mode: str = "single",
        post_processors: list[Callable] | None = None,
        **unstructured_kwargs: Any,
    ):
        # Import eagerly so a missing optional dependency is reported when the
        # parser is constructed rather than when the first document is parsed.
        with optional_imports("xpack-llm-docs"):
            import unstructured.partition.auto  # noqa:F401

        super().__init__()
        _valid_modes = {"single", "elements", "paged"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )

        # Defaults that `__wrapped__` merges with per-call overrides.
        self.kwargs = dict(
            mode=mode,
            post_processors=post_processors or [],
            unstructured_kwargs=unstructured_kwargs,
        )

    def __wrapped__(self, contents: bytes, **kwargs) -> list[dict]:
        """
        Parse the given document.

        Args:
            - contents: document contents
            - **kwargs: override for defaults set in the constructor

        Returns:
            A list with one dictionary per element, as produced by the element's
            ``to_dict()`` method.

        Raises:
            ValueError: if unknown keyword arguments are given, or if the effective
                ``mode`` is anything other than ``elements``.
        """
        import unstructured.partition.auto

        options = {**self.kwargs, **kwargs}
        unstructured_kwargs = options.pop("unstructured_kwargs")
        # A per-call override may pass None; treat it like "no post-processing".
        post_processors = options.pop("post_processors") or []
        mode = options.pop("mode")

        # Validate everything before the (potentially expensive) partition call so
        # that misconfiguration fails fast.
        if options:
            raise ValueError(f"Unknown arguments: {', '.join(options.keys())}")
        if mode != "elements":
            # The `paged` and `single` modes are not implemented in this parser.
            raise ValueError(f"mode of {mode} not supported.")

        elements = unstructured.partition.auto.partition(
            file=BytesIO(contents), **unstructured_kwargs
        )

        for element in elements:
            for post_processor in post_processors:
                element.apply(post_processor)

        return [element.to_dict() for element in elements]

    def __call__(self, contents: pw.ColumnExpression, **kwargs) -> pw.ColumnExpression:
        """
        Parse the given document.

        Args:
            - contents: document contents
            - **kwargs: override for defaults set in the constructor

        Returns:
            A column holding, for each input row, the list of element dictionaries
            returned by ``__wrapped__``.
        """
        return super().__call__(contents, **kwargs)

In [None]:
# Set `export CFLAGS="-Wno-nullability-completeness"` when installing the missing pillow-heif module.
# libmagic -> required for file type detection to work:
# - brew install libmagic
# - pip install python-magic-bin

# parser = parsers.ParseUnstructured(mode="elements")
parser = CustomParseUnstructured(mode="elements") # TODO: do we need extra cleaning functions as post_processors?

In [None]:
# @pw.udf(executor=pw.udfs.async_executor())

@pw.udf
def filter_document(document: pw.Json, fields: list[str]) -> pw.Json:
    """Return the document's top-level mapping without the keys named in ``fields``.

    Keys listed in ``fields`` that are absent from the document are ignored; the
    remaining entries keep their original order.
    """
    return {
        key: value
        for key, value in document.as_dict().items()
        if key not in fields
    }

In [None]:
# u_logger = logging.getLogger("unstructured")
# u_logger.setLevel(logging.INFO)

## Protocol

In [None]:
# Flat record for a single protocol, matching the keys emitted by `protocol_mapper`.
class Protocol(pw.Schema):
    cname: str
    name: str
    categories: str  # comma-joined list, see `protocol_mapper`
    is_enabled: bool
    active_on_website: bool
    total_proposals: int
    total_votes: int
    unique_voters: int
    # tokens: list[object]
    ptype: str  # taken from the response's "type" field
    # delegated_support: dict


In [None]:
def protocol_mapper(raw_data: bytes) -> bytes:
    """Reshape a raw Boardroom protocol response into a flat, snake_case JSON record.

    Args:
        raw_data: encoded JSON response whose ``data`` member holds the protocol.

    Returns:
        Encoded JSON object with the keys expected by the ``Protocol`` schema;
        ``categories`` is collapsed into a comma-separated string.
    """
    payload = json.loads(raw_data.decode())["data"]

    record = {
        "cname": payload["cname"],
        "name": payload["name"],
        "categories": ",".join(payload["categories"]),
    }
    # (output key, response key) pairs that are copied over unchanged.
    renamed = (
        ("is_enabled", "isEnabled"),
        ("active_on_website", "activeOnWebsite"),
        ("total_proposals", "totalProposals"),
        ("total_votes", "totalVotes"),
        ("unique_voters", "uniqueVoters"),
        ("ptype", "type"),
    )
    for target, source in renamed:
        record[target] = payload[source]

    return json.dumps(record).encode()


In [None]:

# Declare an HTTP input over the Boardroom protocol endpoint; each raw response
# is reshaped by `protocol_mapper` before being read with the `Protocol` schema.
# NOTE(review): the API key travels in the query string, so it may surface in
# logs or rendered outputs; check whether the API offers header-based auth.
protocol = pw.io.http.read(
    f"https://api.boardroom.info/v1/protocols/{protocol_name}?key={api_key}",
    method='GET',
    headers={"Accept": "application/json"},
    # format="raw",
    schema=Protocol,
    response_mapper=protocol_mapper
)

In [None]:
# Show the schema of the `protocol` table.
protocol.schema

In [None]:
# Show the `protocol` table object.
protocol

## Proposals

### Old schemas

In [None]:
# class Proposal(pw.Schema):
#     protocol: str
#     ref_id: str
#     pid: str
#     title: str
#     content: str
#     adapter: str
#     proposer: str
#     total_votes: int
#     block_number: int
#     external_url: str
#     start_timestamp: int
#     end_timestamp: int
#     current_state: str
#     # results: list[dict] # TODO
#     ptype: str
#     summary: str
#     privacy: str
#     # executables: dict
#     tx_hash: str
#     quorum: int



In [None]:
# def coalesce(data: dict, key: str, fallback: str = ''):
#     return data[key] if key in data else fallback

# def proposal_mapper(raw_data: bytes) -> bytes:
#     json_data = json.loads(raw_data.decode())
#     data_records = json_data["data"]
#     next_cursor = json_data["nextCursor"]
#     # return json.dumps([
#     #     {
#     #         "protocol": coalesce(data, "protocol"),
#     #         "ref_id": coalesce(data, "refId"),
#     #         "pid": coalesce(data, "id"),
#     #         "title": coalesce(data, "title"),
#     #         "content": coalesce(data, "content"),
#     #         "adapter": coalesce(data, "adapter"),
#     #         "proposer": coalesce(data, "proposer"),
#     #         "total_votes": coalesce(data, "totalVotes"),
#     #         "block_number": coalesce(data, "blockNumber"),
#     #         "external_url": coalesce(data, "externalUrl"),
#     #         "start_timestamp": int(coalesce(data, "startTimestamp")),
#     #         "end_timestamp": int(coalesce(data, "endTimestamp")),
#     #         "current_state": coalesce(data, "currentState"),
#     #         # "results": [{"choice": data["choices"][choice["choice"]], "result": choice["total"]} for choice in data["results"]],
#     #         "ptype": coalesce(data, "type"),
#     #         "summary": coalesce(data, "summary"),
#     #         "privacy": coalesce(data, "privacy"),
#     #         "tx_hash": coalesce(data, "txHash"),
#     #         "quorum": coalesce(data, "quorum"),
#     #     } for data in data_records
#     data = data_records[0]
#     return json.dumps(
#         {
#             "protocol": coalesce(data, "protocol"),
#             "ref_id": coalesce(data, "refId"),
#             "pid": coalesce(data, "id"),
#             "title": coalesce(data, "title"),
#             "content": coalesce(data, "content"),
#             "adapter": coalesce(data, "adapter"),
#             "proposer": coalesce(data, "proposer"),
#             "total_votes": coalesce(data, "totalVotes"),
#             "block_number": coalesce(data, "blockNumber"),
#             "external_url": coalesce(data, "externalUrl"),
#             "start_timestamp": int(coalesce(data, "startTimestamp")),
#             "end_timestamp": int(coalesce(data, "endTimestamp")),
#             "current_state": coalesce(data, "currentState"),
#             # "results": [{"choice": data["choices"][choice["choice"]], "result": choice["total"]} for choice in data["results"]],
#             "ptype": coalesce(data, "type"),
#             "summary": coalesce(data, "summary"),
#             "privacy": coalesce(data, "privacy"),
#             "tx_hash": coalesce(data, "txHash"),
#             "quorum": coalesce(data, "quorum"),
#         }
#     ).encode()

### Code

In [None]:
@pw.udf
def append_parent_id(content: pw.Json, parent_id: str) -> pw.Json:
    """Return the content mapping with a leading ``parent_id`` entry.

    ``parent_id`` acts as a default: when the content already carries its own
    ``parent_id`` key, that existing value is the one kept.
    """
    merged = {"parent_id": parent_id}
    merged.update(content.as_dict())
    return merged

In [None]:
# Clear the name before the table is (re)built in the next cell; presumably a
# guard against stale state when cells are re-run out of order.
proposals = None

In [None]:
# Declare an HTTP input over the Boardroom proposals endpoint for the configured
# protocol. The ordering is passed as `orderByIndexedAt=<asc|desc>`; the `=` was
# previously missing, which produced the meaningless parameter name
# `orderByIndexedAtasc`.
proposals = pw.io.http.read(
    f"https://api.boardroom.info/v1/protocols/{protocol_name}/proposals?key={api_key}&orderByIndexedAt={prop_order_by}",
    method='GET',
    headers={"Accept": "application/json"},
    format="json",
    schema=BoardroomAPI
)
# `data` holds the list of proposals of a response; flatten it to one row per proposal.
proposals = proposals.flatten(proposals.data)

In [None]:
# Promote the fields used downstream to string columns and keep the remaining
# proposal fields as JSON metadata.
# NOTE(review): this cell rebinds `proposals`, so it is stacked on top of the
# result of the read cell each time it is re-run.
proposals = proposals.with_columns(
    # Missing keys fall back to an empty JSON string before conversion to `str`.
    refId=pw.this.data.get("refId", default=pw.Json("")).as_str(),
    title=pw.this.data.get("title", default=pw.Json("")).as_str(),
    # metadata=pw.apply_with_type(lambda x: filter_document(x, ["refId", "title", "content"]), dict, pw.this.data),
    # metadata=pw.apply_with_type(lambda x: filter_document(x), dict, pw.this.data),
    # Everything except the fields that get their own column or are parsed separately.
    metadata=filter_document(pw.this.data, ["refId", "title", "content"]),
)

In [None]:
# Show the schema of `proposals` after the extra columns were added.
proposals.schema

In [None]:
# One "Title" row per proposal, using the same columns (element_id, text,
# metadata, type) as the parsed content rows so both tables can be concatenated
# in the "Joining results" section.
proposals_table = proposals.select(
    element_id=pw.this.refId,
    text=pw.this.title,
    metadata=pw.this.metadata,
    type="Title"
    # content=build_main_element(pw.this.refId, pw.this.title, pw.this.metadata),
)

In [None]:
# Show the schema of the per-proposal title rows.
proposals_table.schema

In [None]:
# Show the `proposals_table` table object.
proposals_table

In [None]:
# Clear the name before the table is (re)built in the next cell; presumably a
# guard against stale state when cells are re-run out of order.
proposal_contents = None

In [None]:
# Proposal content
# Encode each proposal's `content` field to bytes (empty bytes when the value
# handed to the lambda is falsy), run it through `parser`, then flatten so each
# parsed element becomes its own row next to the proposal's refId.
# NOTE(review): confirm how `parser` behaves on empty bytes, and whether a JSON
# null content is falsy here or ends up encoded as the text "None".
proposal_contents = proposals.select(
    refId=pw.this.refId,
    # content=pw.apply_with_type(lambda x: f"{x}".encode() if x else b"", bytes, pw.this.data.get("content", default=None)),
    content=parser(pw.apply_with_type(lambda x: f"{x.as_str()}".encode() if x else b"", bytes, pw.this.data.get("content", default=None))),
)
proposal_contents = proposal_contents.flatten(pw.this.content)
# # proposals = proposals.select(refId=pw.this.refId, title=pw.this.title, text=pw.this.content[0], metadata=pw.this.content[1])
# # proposals = proposals.select(refId=pw.this.refId, title=pw.this.title, text=pw.this.document['text'].as_str(), document=pw.this.document)
# proposals = proposals.select(refId=pw.this.refId, document=pw.this.content)

In [None]:
# Show the schema of the parsed, flattened proposal contents.
proposal_contents.schema

#### Partition analysis

In [None]:
from unstructured.file_utils.filetype import detect_filetype, is_json_processable

In [None]:
# TODO:
# For now, all text data is being recognized as txt files instead of md.

In [None]:
# # checking file type detection during partition

# @pw.udf
# def detect(data: pw.Json) -> str:
#     encoded = data.as_str().encode()
#     filetype = detect_filetype(file=BytesIO(encoded))
#     return str(filetype)
    

# meta = proposals.select(
#     metadata=pw.this.data.get("content", default=None)
# )
# meta = meta.with_columns(
#     filetype=detect(pw.this.metadata),
# )
# meta.schema

In [None]:
# meta

In [None]:
class JSONAccumulator(pw.BaseCustomAccumulator):
    """Custom reducer state that gathers the JSON objects of a group into a list.

    Each accumulator starts out holding the object of the row it was created
    from, and merging two accumulators concatenates their lists. Previously the
    accumulator's own row was never added to ``data``, so the first element of
    every group was dropped and single-row groups reduced to an empty list.
    """

    def __init__(self, initialData: pw.Json):
        # Plain-dict copy of this row's JSON object; kept as an attribute for
        # backward compatibility.
        self.value: dict = {**initialData.as_dict()}
        # Everything accumulated so far, starting with this row's own value.
        self.data: list[dict] = [self.value]

    @classmethod
    def from_row(cls, row):
        """Build an accumulator from a reducer input row holding a single JSON value."""
        [val] = row
        return cls(val)

    def update(self, other):
        """Merge ``other`` into this accumulator, keeping everything it has collected."""
        self.data.extend(other.data)

    def compute_result(self) -> list[dict]:
        """Return the collected objects as a list of dicts."""
        return self.data

In [None]:
# Wrap the accumulator class as a reducer usable inside `.reduce(...)`.
json_acc = pw.reducers.udf_reducer(JSONAccumulator)

In [None]:
# Collect the parsed elements of each proposal (keyed by refId) into a single `contents` list.
grouped = proposal_contents.groupby(proposal_contents.refId).reduce(proposal_contents.refId, contents=json_acc(proposal_contents.content))

In [None]:
# Show the schema of the per-proposal grouping.
grouped.schema

In [None]:
# Register a JSON Lines sink for the grouped contents (used for the partition analysis).
pw.io.jsonlines.write(grouped, "proposalsc.jsonl")

#### Flattening contents

In [None]:
# Spread each parsed element dict into the same columns as `proposals_table`
# (element_id, text, metadata, type); the proposal's refId is passed to
# `append_parent_id` to be recorded in the element metadata.
# NOTE(review): this rebinds `proposal_contents` to a table without a `content`
# column, so re-running this cell without re-running the parsing cell will fail.
proposal_contents = proposal_contents.select(
    element_id=pw.this.content.get("element_id", default=pw.Json("")).as_str(),
    text=pw.this.content.get("text", default=pw.Json("")).as_str(),
    metadata=append_parent_id(pw.this.content["metadata"], pw.this.refId),
    type=pw.this.content.get("type", default=pw.Json("")).as_str(),
)

In [None]:
# Show the schema of the flattened element rows.
proposal_contents.schema

In [None]:
# Show the `proposal_contents` table object.
proposal_contents

#### Joining results

In [None]:
# Stack the per-proposal title rows and the parsed element rows into one table.
# NOTE(review): rebinding `proposals_table` makes this cell non-idempotent;
# re-running it concatenates the element rows again.
proposals_table = proposals_table.concat_reindex(proposal_contents)
proposals_table.schema

#### Intermediate storage

In [None]:
# Register a JSON Lines sink for the combined title + element rows.
pw.io.jsonlines.write(proposals_table, "proposals.jsonl")

In [None]:
# pw.debug.compute_and_print(proposals)

### Chunking

In [None]:
from typing import Optional

In [None]:
# More on chunking parameters https://docs.unstructured.io/open-source/core-functionality/chunking
# Placeholders for the arguments of the planned `chunk_by_title(...)` call; all
# are left as None for now (presumably meaning "use the library default" — TODO confirm).
combine_text_under_n_chars: Optional[int] = None
include_orig_elements: Optional[bool] = None
max_characters: Optional[int] = None
multipage_sections: Optional[bool] = None
new_after_n_chars: Optional[int] = None
overlap: Optional[int] = None
overlap_all: Optional[bool] = None

In [None]:
# TODO: UDF for chunking
# call chunk_by_title(...)

### Embeddings

In [None]:
# Settings passed to the embeddings request in `embedder_call`.
# NOTE(review): the API key is an empty string — confirm the endpoint accepts unauthenticated calls.
EMBEDDINGS_API_KEY = ""
EMBEDDINGS_MODEL = "Meta-Llama-3-8B-Instruct-Q5_K_M"
EMBEDDINGS_API_BASE = "https://llama3.gaianet.network/v1"

In [None]:
# NOTE(review): `embedding` is not defined or imported anywhere in the visible
# notebook; the inline comment below suggests it is meant to be litellm's
# `embedding` function — confirm and import it before using this UDF.
# NOTE(review): the UDF is declared to return `str`, but `result` is whatever the
# provider stores under "embedding" (presumably a list of floats) — confirm the
# intended return type.
@pw.udf(
        # cache_strategy=pw.udfs.DefaultCache(),
        executor=pw.udfs.async_executor(
            # capacity=5,
            # Retry failed calls after a fixed 10 s delay.
            retry_strategy=pw.asynchronous.udfs.FixedDelayRetryStrategy(delay_ms=10000)
        ),
        return_type=str
    )
def embedder_call(document: str) -> str:
    """Request an embedding for a single document from the configured endpoint.

    Sends ``document`` as a one-element batch using the ``EMBEDDINGS_*`` settings
    and returns the ``"embedding"`` entry of the first item in the response.
    """
    returned = embedding(
        input=[document],
        # dimensions=embedding_dimension,
        api_key=EMBEDDINGS_API_KEY,
        model=EMBEDDINGS_MODEL,
        api_base=EMBEDDINGS_API_BASE,
        custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request
    )

    # Only one input was sent, so only the first result is relevant.
    result = returned.data[0]["embedding"]

    return result

In [None]:
# Notes: data sources should match schema (data: bytes, _metadata: any)

# doc_store = VectorStoreServer(
#     # *data_sources(configuration["sources"]),
#     # *data_sources, # TODO:
#     embedder=embedder,
#     # splitter=splitters.TokenCountSplitter(max_tokens=400),
#     parser=parser,
# )

In [None]:
# index = KNNIndex(
#     enriched_documents.vector, enriched_documents, n_dimensions=embedding_dimension
# )
# ...
# query += query.select(
#     vector=embedder(pw.this.query),
# )

# query_context = query + index.get_nearest_items(
#     query.vector, k=3, collapse_rows=True
# ).select(documents_list=pw.this.chunk)

## Voter

In [None]:
def voter_mapper(raw_data: bytes) -> bytes:
    """Reshape a raw single-voter response into a flat JSON record.

    Args:
        raw_data: encoded JSON response whose ``data`` member holds the voter.

    Returns:
        Encoded JSON object with the voter's address followed by the voting
        statistics of the first entry in the voter's ``protocols`` list.
    """
    payload = json.loads(raw_data.decode())["data"]

    record = {"address": payload["address"]}
    # Statistics are always read from the first listed protocol.
    stats = payload["protocols"][0]
    for key in (
        "totalVotesCast",
        "lastVoteCast",
        "firstVoteCast",
        "totalPowerCast",
        "lastCastPower",
    ):
        record[key] = stats[key]

    return json.dumps(record).encode()

# Flat per-voter record; not referenced by any read in the visible notebook.
# NOTE(review): `voter_mapper` does not emit `protocolVoteCast`, so this schema
# and the mapper are out of sync — confirm which one is current.
class Voter(pw.Schema):
    address: str
    firstVoteCast: int
    lastVoteCast: int
    totalVotesCast: int
    protocolVoteCast: str
    totalPowerCast: float
    lastCastPower: float
    # otherProtocols: str
    
# Envelope schema for the voters endpoint: only the top-level `data` field is
# kept, as untyped JSON (same shape as `BoardroomAPI`).
class Voters(pw.Schema):
    data: pw.Json

## TODO: fix this voter -> voters as data : voters[] @santiago
    
# Declare an HTTP input over the Boardroom voters endpoint for the configured
# protocol; responses are read as JSON, keeping only the `data` field.
voters = pw.io.http.read(
    f"https://api.boardroom.info/v1/protocols/{protocol_name}/voters?key={api_key}",
    method='GET',
    headers={"Accept": "application/json"},
    format="json",
    schema=Voters,
    # response_mapper=voter_mapper
)

voters

In [None]:
# Flatten the `data` list of each response into one row per voter.
# NOTE(review): `x` is referenced by the next cell; consider a descriptive name
# (renaming must be done in both cells together).
x = voters.flatten(voters.data)
x

In [None]:
def map_all_protocols(protocols: pw.Json):
    """Return the names of all given protocol entries as one ``", "``-separated string.

    Each entry's ``"protocol"`` field is read as a string. The separator is only
    placed between names, so the result no longer ends with a dangling ``", "``.
    The function deliberately has no return annotation, so the output type seen
    by ``pw.apply`` is unchanged.
    """
    return ", ".join(protocol["protocol"].as_str() for protocol in protocols)

def mapper(protocols: pw.Json):
    """Return the entry of ``protocols`` that belongs to the configured protocol.

    The entry is matched on its ``"protocol"`` field against the notebook-level
    ``protocol_name`` parameter (previously the literal ``"aave"``), so changing
    the parameter keeps this lookup in step with the API requests.

    Returns:
        The first matching entry, or ``None`` when no entry matches. The function
        deliberately has no return annotation, so the output type seen by
        ``pw.apply`` is unchanged.
    """
    for protocol in protocols:
        if protocol["protocol"].as_str() == protocol_name:
            return protocol
    return None

# For every voter row, pick the entry returned by `mapper` and build a summary
# string of all protocols listed for that voter.
z = x.select(
    protocol=pw.apply(mapper, pw.this.data["protocols"]),
    all_protocols=pw.apply(map_all_protocols, pw.this.data["protocols"]),
)

z

In [None]:
#pw.debug.compute_and_print(x)

## Run workflow

In [None]:
#%%capture --no-display
# Execute the pipeline declared above; the HTTP inputs and JSON Lines outputs
# registered in earlier cells are only processed once this runs.
pw.run()