In [0]:
%run ../get_user

In [0]:
# Ask Spark which user the notebook is running as, then reduce the e-mail to a username.
# get_username_from_email is not defined in this notebook — presumably it comes from
# the `%run ../get_user` cell above (TODO confirm).
current_user_rows = spark.sql("SELECT current_user()").collect()
user_email = current_user_rows[0][0]
username = get_username_from_email(user_email)
print(username)

In [0]:
# Notebook configuration: names used to build the volume path and the output table name.
# NOTE(review): dataset_bucket_name is not referenced in the visible cells — confirm it is still needed.
dataset_bucket_name = "revodata-databricks-geospatial"
catalog_name = "geospatial"
schema_name = "zoetermeer"
# Per-user volume root: /Volumes/<catalog>/<schema>/verwijzing_<username>/.
# Depends on `username` from the previous cell; it is the path that gets listed and read below.
VOLUME_PATH = f"/Volumes/{catalog_name}/{schema_name}/verwijzing_{username}/"

In [0]:
from typing import List, Dict, Any

# Top-level (non-recursive) listing of the volume root.
# NOTE(review): file_info_list is not read anywhere in the visible cells, and the
# recursive walk below lists the same path again — this call looks redundant; confirm before removing.
file_info_list: List[Any] = dbutils.fs.ls(VOLUME_PATH)
# Accumulator filled by list_files_recursive(); re-created on every run of this cell
# so re-running does not duplicate entries.
files_data: List[Dict[str, Any]] = []

def list_files_recursive(path: str) -> None:
    """Walk `path` depth-first and record every file found.

    Directories are descended into recursively. For each non-directory entry a
    dict with its ``path``, ``name`` and ``size`` is appended to the module-level
    ``files_data`` list. Nothing is returned; the result is the side effect on
    ``files_data``.

    Args:
        path: Location to list with ``dbutils.fs.ls``.
    """
    for entry in dbutils.fs.ls(path):
        if not entry.isDir():
            record = {
                "path": entry.path,
                "name": entry.name,
                "size": entry.size
            }
            files_data.append(record)
            continue
        # Directory: recurse so nested files end up in the same flat list.
        list_files_recursive(entry.path)

# Imported here so the cell also works on a fresh kernel: in the original notebook
# `col` was only imported in a later cell and `regexp_extract` not at all.
from pyspark.sql.functions import col, regexp_extract

# fid values to keep. Kept as strings because `fid` is produced by regexp_extract
# (a string column); comparing against an int forces an implicit cast that can
# fail under ANSI mode when a parent folder name is not numeric.
TARGET_FIDS = ["192308"]

# Fill `files_data` with one dict per file found under the volume.
list_files_recursive(VOLUME_PATH)

# Positional rows + explicit schema: schema inference raises on an empty list,
# which would happen whenever the volume contains no files.
file_rows = [(f["path"], f["name"], f["size"]) for f in files_data]

df_files_size = (
    spark.createDataFrame(file_rows, schema="path string, name string, size long")
    # fid = name of the folder that directly contains the file.
    .withColumn("fid", regexp_extract(col("path"), r"/([^/]+)/[^/]+$", 1))
    .filter(col("fid").isin(TARGET_FIDS))
)
display(df_files_size)

In [0]:
# 192350, 192356, 192359, 192366, 192367, 192372
# NOTE(review): presumably further fids still to be processed — confirm with the notebook owner.

# All Spark helpers used in this cell are imported here so it runs on a fresh kernel
# (the original relied on `regexp_extract` / `ai_parse_document` already being in scope).
from pyspark.sql.functions import col, expr, regexp_extract

# fid values to parse. Strings, because `fid` is a string column extracted from the
# path; comparing with an int forces an implicit cast that can fail under ANSI mode
# for non-numeric folder names.
TARGET_FIDS = ["192308"]

# Destination table for the parsed documents (one table per user).
parsed_table_name = f"{catalog_name}.{schema_name}.parsed_verwijzing_{username}"

# Read every file under the volume as binary and keep only the selected fids.
# fid = name of the folder that directly contains the file.
df_files = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(VOLUME_PATH)
    .withColumn("fid", regexp_extract(col("path"), r"/([^/]+)/[^/]+$", 1))
    .filter(col("fid").isin(TARGET_FIDS))
)

display(df_files)

# ai_parse_document is a SQL AI function, so it is invoked through expr() rather
# than as a Python callable.
df_parsed = df_files.select(
    col("fid"),
    col("path").alias("file_name"),
    expr("ai_parse_document(content)").alias("parsed_content"),
)

# Optional enrichment, currently disabled:
# .withColumn(
#     "classification",
#     expr('ai_classify(CAST(parsed_content AS STRING), ARRAY("confidential", "not confidential"))')
# ).withColumn(
#     "summary",
#     expr('ai_summarize(CAST(parsed_content AS STRING), 0)')
# )

df_parsed.write.mode("overwrite").saveAsTable(parsed_table_name)

# Show the persisted result. Displaying `df_parsed` itself would re-evaluate the lazy
# plan and run the document parsing a second time.
display(spark.table(parsed_table_name))