In [16]:
from io import BytesIO
import tempfile
from typing import List

from dagster import AssetExecutionContext, asset, EnvVar
from dagster_aws.s3 import S3Resource

import os 
from dagster_project.partitions import password_archive_partitions_def
from dagster_project.utils import copy_archive_to_s3, get_objects
from pyiceberg.catalog import load_catalog
import polars as pl
# Names used as S3 buckets by this notebook. RAW_BUCKET is the bucket the
# loading cells read from; LEAKS_BUCKET and FOLDER_PATH are not referenced in
# the visible cells (presumably they locate the original archive -- confirm).
LEAKS_BUCKET = 'leaks'
RAW_BUCKET = 'raw'
FOLDER_PATH = 'Cit0/Cit0day.in_special_for_xss.is/Cit0day Prem [_special_for_xss.is]'

# Connection settings for the source object store come from the environment.
# os.getenv returns None for an unset variable, so a missing variable is passed
# to S3Resource as None rather than raising here.
secret_access_key = os.getenv("SOURCES__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY")
access_key_id = os.getenv("SOURCES__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID")
region_name = 'us-east-1'
endpoint_url = os.getenv("SOURCES__FILESYSTEM__CREDENTIALS__AWS_S3_ENDPOINT")

# S3 resource shared by the cells below. region_name was previously defined
# but never handed to the resource; it is now passed explicitly so the client
# does not depend on whatever default region the environment provides.
s3 = S3Resource(
    endpoint_url=endpoint_url,
    region_name=region_name,
    use_ssl=False,
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
)


import pyarrow as pa
# Column names shared by the Arrow and Polars schemas below. Every column is a
# plain string; keeping the names in one place keeps the two schemas aligned.
PASSWORD_FILE_COLUMNS = ("email", "data", "bucket", "prefix")

pa_strings = pa.string()

# Arrow schema, used when creating the Iceberg table.
password_files_pyarrow_schema = pa.schema(
    [(column_name, pa_strings) for column_name in PASSWORD_FILE_COLUMNS]
)

# Equivalent Polars schema, used when parsing the downloaded files.
password_files_polars_schema = pl.Schema(
    {column_name: pl.String() for column_name in PASSWORD_FILE_COLUMNS}
)

# Properties for the Iceberg REST catalog. The destination object-store
# endpoint and credentials are resolved from the environment through Dagster's
# EnvVar at the moment this cell runs.
iceberg_catalog_properties = {
    "warehouse": "s3://iceberg/warehouse",
    "uri": "https://nessie.local.reinthal.cc/iceberg",
    "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
    "s3.endpoint": EnvVar("DESTINATION__FILESYSTEM__CREDENTIALS__AWS_S3_ENDPOINT").get_value(),
    "s3.access-key-id": EnvVar("DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID").get_value(),
    "s3.secret-access-key": EnvVar("DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY").get_value(),
    "type": "rest",
}
catalog = load_catalog("default", **iceberg_catalog_properties)

# Make sure the target namespace and table exist before anything is appended.
# The *_if_not_exists variants are used so that re-running this cell does not
# fail on objects created by an earlier run.
catalog.create_namespace_if_not_exists("staging")

table = catalog.create_table_if_not_exists(
    "staging.cit0day_password_files",
    schema=password_files_pyarrow_schema,
)

In [17]:
# Key prefix of the single extracted archive this notebook loads. The trailing
# slash is part of the prefix. The literal is split in two only to keep the
# line readable; adjacent string literals are concatenated by Python.
upstream_archive = (
    'extracted/Cit0day Prem [_special_for_xss.is]/'
    '03designscommunications.ca {2.044} [NOHASH] (NoCategory)_special_for_XSS.IS/'
)

In [18]:
# List what sits under the archive prefix in the raw bucket.
# NOTE(review): get_objects is a project helper not shown here; the loading
# cell passes each returned element straight to download_fileobj as the object
# key, so it presumably yields key strings -- confirm.
objs = get_objects(
    s3=s3,
    source_bucket=RAW_BUCKET,
    prefix=upstream_archive,
)

In [19]:
# Load every listed object into the Iceberg staging table, one append per file.
#
# Each file is parsed as header-less, ':'-separated text using the shared
# Polars schema. The "bucket" and "prefix" columns are then overwritten with
# the source bucket and the object key so every row records where it came from.
# NOTE(review): lines whose second field itself contains ':' or '"' may not
# parse as intended with the default CSV settings -- verify against real data.

# One client for the whole loop instead of a new one per object.
s3_client = s3.get_client()

for file_name in objs:
    # Download the object into memory.
    file_obj = BytesIO()
    s3_client.download_fileobj(RAW_BUCKET, file_name, file_obj)

    # Zero-byte objects (for example a folder-marker key under the prefix)
    # hold no rows, and the CSV reader raises on empty input, so skip them
    # rather than aborting the whole load.
    if file_obj.getbuffer().nbytes == 0:
        continue

    # The download leaves the cursor wherever the last write ended; rewind so
    # the reader starts at the first byte however it consumes the buffer.
    file_obj.seek(0)

    df = pl.read_csv(
        file_obj,
        has_header=False,
        separator=":",
        schema=password_files_polars_schema,
    )

    # Stamp provenance and convert to Arrow for the Iceberg writer.
    pa_df = df.with_columns(
        pl.lit(RAW_BUCKET).alias("bucket"),
        pl.lit(file_name).alias("prefix"),
    ).to_arrow()
    table.append(pa_df)

extracted/Cit0day Prem [_special_for_xss.is]/03designscommunications.ca {2.044} [NOHASH] (NoCategory)_special_for_XSS.IS/03designscommunications.ca {2.044} [NOHASH].txt
