In [1]:
import polars as pl
import os
from dagster import EnvVar
from pyiceberg.catalog import load_catalog
from dagster_project.schemas import (
    cit0day_partition_spec,
    cit0day_sort_order,
    cit0day_polars_schema,
    cit0day_schema,
)
from dagster_aws.s3 import S3Resource

# Static catalog settings.
# NOTE(review): `name` is not referenced by any statement visible in this
# notebook; it is kept in case a later cell relies on it.
name = "default"
catalog_type = "rest"
py_io_impl = "pyiceberg.io.pyarrow.PyArrowFileIO"

# Catalog location, read from the NESSIE_* environment variables.
warehouse = EnvVar("NESSIE_WAREHOUSE").get_value()
branch = EnvVar("NESSIE_BRANCH").get_value()
uri = EnvVar("NESSIE_URI").get_value()

# Object-store endpoint and credentials handed to the catalog's file IO.
s3_endpoint = EnvVar(
    "DESTINATION__FILESYSTEM__CREDENTIALS__AWS_S3_ENDPOINT"
).get_value()
s3_access_key_id = EnvVar(
    "DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"
).get_value()
s3_secret_access_key = EnvVar(
    "DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"
).get_value()

# The property keys contain dots and dashes, so they cannot be written as
# plain keyword arguments; they are collected in a dict and unpacked instead.
catalog_properties = {
    "warehouse": warehouse,
    # The branch is appended to the base URI as a trailing path segment.
    "uri": f"{uri}/{branch}",
    "py-io-impl": py_io_impl,
    "s3.endpoint": s3_endpoint,
    "s3.access-key-id": s3_access_key_id,
    "s3.secret-access-key": s3_secret_access_key,
    "type": catalog_type,
}
catalog = load_catalog(**catalog_properties)

# Show which branch this session is pointed at.
print(branch)

feat-push-data-to-elastic


In [9]:
# Reset step: remove the staging table so the cells below can recreate it.
#
# The drop is guarded by the namespace listing so this cell can run on a fresh
# kernel (Restart & Run All) even when the table does not exist yet; an
# unconditional drop would raise in that case and stop the notebook.
staging_tables = catalog.list_tables("staging")
# The listing is already scoped to the "staging" namespace, so matching on the
# last identifier component (the table name) is sufficient.
if any(ident[-1] == "cit0day_password_files" for ident in staging_tables):
    # drop_table has no return value, so nothing is bound to a name here.
    catalog.drop_table("staging.cit0day_password_files")

# Display what is left in the namespace after the reset.
catalog.list_tables("staging")

In [5]:
# S3 resource for the source object store; endpoint and credentials come from
# the SOURCES__FILESYSTEM__CREDENTIALS__* environment variables.
nas_minio = S3Resource(
    endpoint_url=os.environ.get("SOURCES__FILESYSTEM__CREDENTIALS__AWS_S3_ENDPOINT"),
    aws_access_key_id=os.environ.get(
        "SOURCES__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID"
    ),
    aws_secret_access_key=os.environ.get(
        "SOURCES__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY"
    ),
)
client = nas_minio.get_client()

# Fetch a single parquet object from the "raw" bucket and read its body
# straight into a Polars frame.
obj = client.get_object(
    Bucket="raw",
    Key="parquets/f4b12f8a-c762-4bff-b423-06e0bb6b250e.parquet",
)
df = pl.read_parquet(obj["Body"])
print(f"Polars shape {df.shape}")

# Arrow copy of the same data; this is what gets appended to the Iceberg table
# in a later cell.
pa_df = df.to_arrow()
print(f"Pyarrow shape: {pa_df.shape}")

# Create the staging table if it is missing. Left as the cell's final
# expression so the returned table is rendered as the cell output.
catalog.create_table_if_not_exists(
    "staging.cit0day_password_files",
    schema=cit0day_schema,
    sort_order=cit0day_sort_order,
    partition_spec=cit0day_partition_spec,
)

Polars shape (4830684, 7)
Pyarrow shape: (4830684, 7)


cit0day_password_files(
  1: email: optional string,
  2: username: optional string,
  3: email_domain: optional string,
  4: data: optional string,
  5: bucket: optional string,
  6: prefix: optional string,
  7: category: optional string
),
partition by: [category],
sort order: [1 ASC NULLS FIRST],
snapshot: null

In [6]:
# Append the Arrow data built in the previous cell to the staging table.
# NOTE(review): this cell is not idempotent -- each run appends `pa_df` again.
# Re-run the drop/create cells first if a clean table is wanted.
table = catalog.load_table("staging.cit0day_password_files")
# append() has no return value, so the call is not bound to a name.
table.append(pa_df)

# Query the Data

In [7]:
# Load the staging table again (same identifier as the append cell) so the
# query cell below can be run without re-running the append.
table = catalog.load_table("staging.cit0day_password_files")

In [8]:
# Expose a scan of the Iceberg table to DuckDB under the table name that the
# SQL below refers to.
con = table.scan().to_duckdb(table_name="cit0day_password_files")

# Select the gmail.com rows and fetch them as an Arrow table.
gmail_rows = con.execute(
    """
            select 
            * from cit0day_password_files
            where email_domain='gmail.com'
        """
).fetch_arrow_table()

# Wrap the Arrow result in a Polars frame for display.
# NOTE(review): the printed frame contains email addresses and `data` values;
# consider clearing this cell's output before sharing the notebook.
duck_df = pl.DataFrame(gmail_rows)
print(duck_df)



shape: (977_944, 7)
┌───────────────┬──────────────┬──────────────┬──────────────┬────────┬──────────────┬─────────────┐
│ email         ┆ username     ┆ email_domain ┆ data         ┆ bucket ┆ prefix       ┆ category    │
│ ---           ┆ ---          ┆ ---          ┆ ---          ┆ ---    ┆ ---          ┆ ---         │
│ str           ┆ str          ┆ str          ┆ str          ┆ str    ┆ str          ┆ str         │
╞═══════════════╪══════════════╪══════════════╪══════════════╪════════╪══════════════╪═════════════╡
│ 0325maomao@gm ┆ 0325maomao   ┆ gmail.com    ┆ $H$9u2WyOAtb ┆ raw    ┆ extracted/Ci ┆ Auto        │
│ ail.com       ┆              ┆              ┆ QZadYQNzkSGF ┆        ┆ t0day Prem   ┆             │
│               ┆              ┆              ┆ /Ndv84…      ┆        ┆ [_speci…     ┆             │
│ 0670802436m@g ┆ 0670802436m  ┆ gmail.com    ┆ $H$9kETH0pdf ┆ raw    ┆ extracted/Ci ┆ Auto        │
│ mail.com      ┆              ┆              ┆ pGfJY398u9tp ┆        ┆