In [2]:
from pyspark.sql.functions import to_timestamp, col, lit, input_file_name, udf
import re

# Function to extract date from filename
def extract_date_from_filename(filename):
    """Return the first ``YYYY-MM`` token found in *filename*.

    Args:
        filename: File name or full path to search. ``None`` and the empty
            string are accepted because a Spark UDF receives NULL column
            values as ``None``.

    Returns:
        The first substring made of four digits, a hyphen and two digits
        (e.g. ``"2023-05"``), or ``None`` when there is no such substring
        or no filename was supplied.
    """
    # Guard first: re.search() raises TypeError when handed None.
    if not filename:
        return None
    match = re.search(r'(\d{4}-\d{2})', filename)
    return match.group(1) if match else None

# Register the function as a UDF (udf() uses a string return type by default)
extract_date_udf = udf(extract_date_from_filename)

# Glob matching every CSV file in the LDAP folder
folder_path = "abfss://cyber-filesystem@cyberdatastore.dfs.core.windows.net/LDAP/*.csv"

# Read all matching CSV files, using the first line of each as the header
df = spark.read.option("header", "true").csv(folder_path)

# Tag every row with the year-month token taken from its source file path.
# extract_date_from_filename yields strings shaped like "2023-05"
# (regex \d{4}-\d{2}), so the parse pattern has to be "yyyy-MM"; the earlier
# "MM/dd/yyyy HH:mm:ss" pattern cannot match that shape.
df = df.withColumn("date", extract_date_udf(input_file_name()))
df = df.withColumn("date", to_timestamp("date", "yyyy-MM"))

# Clean column names: trim surrounding whitespace and drop columns whose
# name is empty after trimming
df_clean = df.select([col(c).alias(c.strip()) for c in df.columns if c.strip() != ""])
df_clean = df_clean.withColumnRenamed("user_id", "user")

# Save the cleaned DataFrame as a table, replacing any existing contents
df_clean.write.mode("overwrite").saveAsTable("clean_user_details")

StatementMeta(cybersparkpool, 37, 3, Finished, Available, Finished)