Manually Controlling JSONB Serialization/Deserialization #10944
-
I am developing an artifact system that uses a PostgreSQL (psycopg2) database. My model looks roughly like this:

class Artifact(Base):
__tablename__ = "artifact"
id = Column(Integer, primary_key=True, autoincrement=True)
content_hash = Column(String, nullable=False)
content_json = Column(StringJSONB, nullable=True)

Because I keep track of the content's hash, I end up doing something like this when creating an artifact:

def create_artifact(value: Any) -> None:
return Artifact(
content_json=value,
content_hash=hashlib.sha256(json.dumps(value).encode("utf-8")).hexdigest()
) The consequence being that I end up serializing my data twice - once to compute the hash and again when SQLAlchemy automatically serializes the data to store it in the database. It would be much better if I could instead ask SQLAlchemy to not automatically serialize the data to JSON and instead do it myself: def create_artifact(value: Any) -> None:
data = json.dumps(value)
return Artifact(
content_json=data,
content_hash=hashlib.sha256(data.encode("utf-8")).hexdigest()
)

Turning off this automatic serialization behavior is possible, but as per the docs, it requires that I disable automatic serialization of JSON values at the engine level. As explained here this is because, "when using psycopg2, the DBAPI only allows serializers at the per-cursor or per-connection level." This is problematic because the rest of my application would benefit from, and in fact assumes, the automatic serialization of JSON values. I attempted to create a custom type decorator that I could substitute for JSONB:

from typing import Any, Callable
import json
from sqlalchemy import Dialect
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql.elements import Null
from sqlalchemy.types import TypeDecorator
class StringJSONB(TypeDecorator):
"""JSON that requires the user to manually serialize and deserialize to and from strings"""
impl = JSONB
cache_ok = True
def __init__(self, *args: Any, **kwargs: Any):
super().__init__(*args, none_as_null=False, **kwargs)
def bind_processor(self, dialect: Dialect) -> Callable[[Any], Any]:
return lambda value: None if value is None or value is self.NULL or isinstance(value, Null) else value
def result_processor(self, dialect: Dialect, coltype: Any) -> Callable[[Any], Any]:
return lambda value: (
value
if value is None
# Unfortunately we have to re-serialize the value because the postgres dialect automatically
# deserializes the value into a dict. As far as I can tell, there's no way to disable this behavior.
else json.dumps(value)
)

It seems odd that I'm able to successfully skip the automatic JSON serialization by defining a bind_processor, yet cannot skip the automatic deserialization in the same way. Here is a full working example showing my current workaround:

import hashlib
import json
from typing import Any, Callable
from sqlalchemy import Column, Dialect, Integer, String, create_engine
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.elements import Null
from sqlalchemy.types import TypeDecorator
# Declarative base class shared by the ORM models in this example.
Base = declarative_base()
class StringJSONB(TypeDecorator):
    """JSONB variant whose values are exchanged as pre-serialized JSON strings.

    The caller performs ``json.dumps``/``json.loads`` manually; this type only
    passes the string through on the way in and re-serializes on the way out.
    """

    impl = JSONB
    cache_ok = True

    def __init__(self, *args: Any, **kwargs: Any):
        # Pin none_as_null=False so NULL handling is decided in bind_processor.
        super().__init__(*args, none_as_null=False, **kwargs)

    def bind_processor(self, dialect: Dialect) -> Callable[[Any], Any]:
        def process(bound: Any) -> Any:
            # Collapse Python None and SQLAlchemy's NULL markers to None;
            # anything else is the caller's pre-serialized string, untouched.
            if bound is None or bound is self.NULL or isinstance(bound, Null):
                return None
            return bound

        return process

    def result_processor(self, dialect: Dialect, coltype: Any) -> Callable[[Any], Any]:
        def process(row_value: Any) -> Any:
            # Unfortunately we have to re-serialize the value because the
            # postgres dialect automatically deserializes the value into a
            # dict. As far as I can tell, there's no way to disable this.
            if row_value is None:
                return None
            return json.dumps(row_value)

        return process
class Artifact(Base):
    """ORM model for the ``artifact`` table: hashed, manually serialized JSON content."""
    __tablename__ = "artifact"
    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # SHA-256 hex digest of the serialized JSON content (computed by the caller).
    content_hash = Column(String, nullable=False)
    # JSON payload stored via StringJSONB, so the caller supplies/receives strings.
    content_json = Column(StringJSONB, nullable=True)
def create_artifact(value: Any) -> Artifact:
    """Build an Artifact, serializing *value* exactly once for both storage and hashing."""
    serialized = json.dumps(value)
    digest = hashlib.sha256(serialized.encode("utf-8")).hexdigest()
    return Artifact(
        content_json=serialized,
        content_hash=digest,
    )
def main() -> None:
    """Round-trip a dict through StringJSONB and verify it comes back as a string."""
    engine = create_engine("postgresql://username:password@localhost:5432/postgres")
    Base.metadata.create_all(engine)
    session_factory = sessionmaker(bind=engine)
    db = session_factory()

    payload = {"foo": "bar"}
    db.add(create_artifact(payload))
    db.commit()

    stored = db.query(Artifact).first()
    # The result processor re-serializes, so the column comes back as JSON text.
    assert isinstance(stored.content_json, str)
    assert json.loads(stored.content_json) == payload
    print("It works!")


if __name__ == "__main__":
    main()
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 4 replies
-
Hi,
well you can control what gets passed to the dialect, but once the result_processor runs the dialect has already de-serialized the data from the db. I think your best option here is probably to either de-register the deserializer in the dialect or update your StringJSONB type to cast to text in bind_expression. Personally I would make the db compute that hash, using a computed column and one of the hashing functions supported by pg https://www.postgresql.org/docs/current/pgcrypto.html |
Beta Was this translation helpful? Give feedback.
Hi,
well you can control what gets passed to the dialect, but once the result_processor runs the dialect has already de-serialized the data from the db.
I think your best option here is probably to either de-register the deserializer in the dialect or update your
StringJSONB
type to cast to text in bind_expression
.Personally I would make the db compute that hash, using a computed column and one of the hashing function supported by pg https://www.postgresql.org/docs/current/pgcrypto.html