In [31]:
import delta
import delta.pip_utils
import delta.tables
import pyspark
import pyarrow as pa
import pyspark.pandas as ps

def get_spark():
    """Return a SparkSession named "lakehouse" with Delta Lake support enabled.

    The session builder is given the Delta SQL extension and the Delta
    catalog implementation, then passed through
    ``delta.pip_utils.configure_spark_with_delta_pip`` before
    ``getOrCreate()`` is called on the result.

    Returns:
        Whatever ``getOrCreate()`` yields for the configured builder.
    """
    delta_settings = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }

    session_builder = pyspark.sql.SparkSession.builder.appName("lakehouse")
    # Apply the settings in declaration order, one .config() call per entry.
    for setting_name, setting_value in delta_settings.items():
        session_builder = session_builder.config(setting_name, setting_value)

    delta_ready_builder = delta.pip_utils.configure_spark_with_delta_pip(session_builder)
    return delta_ready_builder.getOrCreate()


In [1]:
import polars as pl
from deltalake import DeltaTable

# One-row frame holding a timezone-aware (UTC), microsecond-precision timestamp.
df = pl.select(pl.datetime(2010, 1, 1, time_unit="us", time_zone="UTC"))

# Arrow schema of the frame before anything is written to disk.
print(df.to_arrow().schema)

# Persist the same frame twice: as a Delta table and as a plain Parquet file.
# mode="overwrite" keeps this cell re-runnable; the default mode="error"
# raises as soon as the "test" table already exists from a previous run.
df.write_delta("test", mode="overwrite")
df.write_parquet("test.parquet")

# Re-open the Delta table and print the schema it reports, for comparison
# with the Arrow schema printed above.
dt = DeltaTable("test")

print(dt.schema().to_pyarrow())


datetime: timestamp[us, tz=UTC]
datetime: timestamp[us]


In [9]:
import pyarrow.parquet as pq

# Parquet data file inside the "test" table directory.
# NOTE(review): the file name embeds a UUID, so it presumably changes whenever
# the table is re-created — confirm and update the path after a fresh run.
file = "test/0-64c5f7d8-9779-48bc-af49-aa16881115cf-0.parquet"

# Footer schema of the file inside the table directory. Uses `file` rather
# than repeating the path literal, so the two can never drift apart.
metadata = pq.read_metadata(file)
print(metadata.schema)

# Same inspection for the standalone Parquet file.
metadata = pq.read_metadata("test.parquet")
print(metadata.schema)

<pyarrow._parquet.ParquetSchema object at 0x7f87f41e6d00>
required group field_id=-1 schema {
  optional int64 field_id=-1 datetime (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
}

<pyarrow._parquet.ParquetSchema object at 0x7f87f408ac40>
required group field_id=-1 root {
  optional int64 field_id=-1 datetime (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
}



In [10]:
import pandas as pd

# Column dtypes pandas reports after loading the standalone Parquet file.
parquet_dtypes = pd.read_parquet("test.parquet").dtypes
parquet_dtypes

datetime    datetime64[us, UTC]
dtype: object

In [32]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from delta.tables import DeltaTable
from pathlib import Path
from pyspark.sql.functions import lit, to_timestamp, col

# Column layout of the Delta table: a nullable integer, string and timestamp.
schema = StructType(
    [
        StructField("integer", IntegerType(), True),
        StructField("string", StringType(), True),
        StructField("timestamp", TimestampType(), True),
    ]
)

spark = get_spark()

# The table lives in a "schema" directory under the current working directory.
path = Path.cwd() / "schema"

# Creating a table at a location that already holds one fails, which made this
# cell impossible to re-run. Reuse the existing table when there is one and
# only create it (with reader version 3 / writer version 7) on the first run.
if DeltaTable.isDeltaTable(spark, str(path)):
    schema_table = DeltaTable.forPath(spark, str(path))
else:
    schema_table = (
        DeltaTable.create(spark)
        .location(str(path))
        .addColumns(schema)
        .property("delta.minReaderVersion", "3")
        .property("delta.minWriterVersion", "7")
        .execute()
    )

# Trailing expression so the notebook still displays the DeltaTable handle.
schema_table

<delta.tables.DeltaTable at 0x7f87c0c29a50>

In [33]:

# Input schema for the sample row: the timestamp is still text at this point,
# so that column is declared as a string.
schema = StructType(
    [
        StructField("integer", IntegerType(), True),
        StructField("string", StringType(), True),
        StructField("timestamp", StringType(), True),
    ]
)

sample_rows = [(4, "delta", "2022-06-29 12:01:19.000")]
sample_raw = spark.createDataFrame(sample_rows, schema=schema)

# Turn the text column into a real timestamp before it is written.
sample_typed = sample_raw.withColumn(
    "timestamp",
    col("timestamp").cast("timestamp"),
)

# Append the converted row to the Delta table at `path`.
sample_typed.write.save(str(path), mode="append", format="delta")

In [34]:
# Footer schema of one part file from the "schema" directory.
spark_part_file = "schema/part-00000-0d310fe0-1c85-4a03-a464-0501acfd6dde-c000.snappy.parquet"
metadata = pq.read_metadata(spark_part_file)
print(metadata.schema)

<pyarrow._parquet.ParquetSchema object at 0x7f87d0862e40>
required group field_id=-1 spark_schema {
  optional int32 field_id=-1 integer;
  optional binary field_id=-1 string (String);
  optional int64 field_id=-1 timestamp (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
}



In [26]:
# Load one part file from the "schema" directory as an Arrow table and show it.
spark_part_table = pq.read_table(
    "schema/part-00023-cda0b9ea-1909-4b03-9e81-a2a87743b903-c000.snappy.parquet"
)
spark_part_table

pyarrow.Table
integer: int32
string: string
timestamp: timestamp[ns]
----
integer: [[4]]
string: [["delta"]]
timestamp: [[2022-06-29 10:01:19.000000000]]