In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a single-node local Spark session for this notebook.
# Both Parquet write rebase modes are set to CORRECTED; presumably this is
# needed because the test data contains very old dates -- TODO confirm.
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .master("local")
    .config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)

# Report the Spark version and the address of the Spark web UI.
banner = f"spark {spark.version} {spark.sparkContext.uiWebUrl}"
print(banner)

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
# Load the CSV using its header row for column names, with no schema
# inference requested.
df = spark.read.csv(
    "../resources/sourcedata/test_dates.csv",
    header=True,
)
df.printSchema()
df.show(truncate=30)

root
 |-- name: string (nullable = true)
 |-- date1: string (nullable = true)
 |-- datetime1: string (nullable = true)

+------+----------+--------------------------+
|  name|     date1|                 datetime1|
+------+----------+--------------------------+
| first|2024-10-30|2024-11-02 23:11:20.562192|
|second|2024-11-02|2024-11-04 02:01:03.001230|
| third|1899-12-10|2024-11-04 03:42:11.501230|
|fourth|1923-12-01|1899-12-10 03:45:31.654109|
| fifth|1411-02-01|2024-12-31 23:59:59.999999|
| sixth|0001-01-01|2024-12-31 23:59:59.999999|
+------+----------+--------------------------+



In [3]:
# Reload the same CSV, this time asking Spark to infer each column's type
# from the data.
df = spark.read.csv(
    "../resources/sourcedata/test_dates.csv",
    header=True,
    inferSchema=True,
)
df.printSchema()
df.show(truncate=30)

root
 |-- name: string (nullable = true)
 |-- date1: date (nullable = true)
 |-- datetime1: timestamp (nullable = true)

+------+----------+--------------------------+
|  name|     date1|                 datetime1|
+------+----------+--------------------------+
| first|2024-10-30|2024-11-02 23:11:20.562192|
|second|2024-11-02| 2024-11-04 02:01:03.00123|
| third|1899-12-10| 2024-11-04 03:42:11.50123|
|fourth|1923-12-01|1899-12-10 03:45:31.654109|
| fifth|1411-02-01|2024-12-31 23:59:59.999999|
| sixth|0001-01-01|2024-12-31 23:59:59.999999|
+------+----------+--------------------------+



In [4]:
# Write the DataFrame out as Parquet, using "overwrite" save mode for the
# target path.
df.write.parquet(
    "../resources/generated/write_test/test_dates.parquet",
    mode="overwrite",
)

In [5]:
# Read the Parquet output back in and display its schema and rows.
rst = (
    spark.read
    .format("parquet")
    .load("../resources/generated/write_test/test_dates.parquet")
)
rst.printSchema()
rst.show(truncate=30)

root
 |-- name: string (nullable = true)
 |-- date1: date (nullable = true)
 |-- datetime1: timestamp (nullable = true)

+------+----------+--------------------------+
|  name|     date1|                 datetime1|
+------+----------+--------------------------+
| first|2024-10-30|2024-11-02 23:11:20.562192|
|second|2024-11-02| 2024-11-04 02:01:03.00123|
| third|1899-12-10| 2024-11-04 03:42:11.50123|
|fourth|1923-12-01|1899-12-10 03:45:31.654109|
| fifth|1411-02-01|2024-12-31 23:59:59.999999|
| sixth|0001-01-01|2024-12-31 23:59:59.999999|
+------+----------+--------------------------+

