In [1]:
import os

from pyspark.sql import SparkSession

# Initialize a Spark session with the Delta Lake and S3A (hadoop-aws) packages.
#
# Connection settings are read from environment variables so that credentials
# and endpoints can be supplied without editing the notebook. The fallbacks are
# the values this notebook previously hard-coded, which keeps existing runs
# working unchanged; override them via the environment for any other setup.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://dbms-spark-master:7077")
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://dbms-minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio_user")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio_password")

spark = (
    SparkSession.builder
    .appName("ImportCSVLakehouse")
    .master(spark_master_url)
    # Delta Lake runtime plus the S3A filesystem connector.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0,org.apache.hadoop:hadoop-aws:3.3.4")
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A connection settings for the object store.
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)


In [2]:
# Location of the raw CV export in object storage.
minio_bucket = "raw-bucket"
csv_path = f"s3a://{minio_bucket}/image_cv/CV.csv"

# Load the CSV with a header row and inferred column types.
# The parser is told to use "," as the delimiter, decode as UTF-8, treat the
# double quote as the escape character, and run in multiline mode.
df = (
    spark.read
    .options(
        delimiter=",",
        encoding="UTF-8",
        escape="\"",
        multiline="true",
    )
    .csv(csv_path, header=True, inferSchema=True)
)

# Preview the loaded rows (long cell values are truncated).
df.show(truncate=True)

+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|          Profession|       Category|             Summary|          Educations|         Experiences|              Skills|              Awards|      certifications|           languages|references|
+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Information Techn...|  IT Technician|Versatile Systems...|[{'start_year': N...|[{'start_date': {...|                  []|                  []|[{'year': None, '...|                  []|        []|
|Information Techn...|  IT Technician|Possesses an exte...|[{'start_year': N...|[{'start_date': {...|['Word', 'Excel',...|                  []|[{'year': None, '...|                  []|        []|
| RF Systems En

In [4]:
# Destination of the unified Delta table in the cleaned bucket.
delta_table = "unified_cv"
cleaned_minio_bucket = "cleaned-bucket"
table_location = f"s3a://{cleaned_minio_bucket}/unified/{delta_table}"

# Append the CSV rows to the Delta table at `table_location`, with the
# mergeSchema option enabled for the write.
# NOTE: the write mode is "append", so every re-run of this cell adds the same
# rows to the table again.
df.write.save(
    table_location,
    format="delta",
    mode="append",
    mergeSchema="true",
)

In [5]:
# Read the Delta table back from its storage location and preview it.
delta_df = spark.read.load(table_location, format="delta")
delta_df.show()

+------------+--------------------+--------------------+--------------------+----------+-------+----------+-----------+------+------+--------------+---------+----------+
|    Category|               Skill|             Company|             Project|Profession|Summary|Educations|Experiences|Skills|Awards|certifications|languages|references|
+------------+--------------------+--------------------+--------------------+----------+-------+----------+-----------+------+------+--------------+---------+----------+
|Data Science|[javascript, jque...|   Ernst & Young LLP|[{Core member of ...|      NULL|   NULL|      NULL|       NULL|  NULL|  NULL|          NULL|     NULL|      NULL|
|Data Science|[python, statsmod...|            Matelabs|                  []|      NULL|   NULL|      NULL|       NULL|  NULL|  NULL|          NULL|     NULL|      NULL|
|Data Science|[analysis, excel,...|      THEMATHCOMPANY|                  []|      NULL|   NULL|      NULL|       NULL|  NULL|  NULL|          NULL|  