In [5]:
import os

from pyspark.sql import SparkSession

# Connection settings for the Spark master and the MinIO (S3-compatible) store.
# Each value can be overridden through an environment variable; the fallbacks
# are the values this notebook previously hard-coded, so it runs unchanged when
# the variables are not set.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://dbms-spark-master:7077")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://dbms-minio:9000")
# NOTE: the credential fallbacks are local-development defaults only. Supply
# MINIO_ACCESS_KEY / MINIO_SECRET_KEY from the environment for any shared or
# production deployment instead of relying on these literals.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio_user")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio_password")

# Initialize Spark session with the Delta Lake and hadoop-aws packages.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("DeltaLakeExample")
    .master(SPARK_MASTER_URL)
    # Delta Lake runtime plus the S3A connector used for s3a:// paths.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0,org.apache.hadoop:hadoop-aws:3.3.4")
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A settings pointing the connector at the MinIO endpoint.
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)


In [8]:
# Location of the source CSV inside the bucket, addressed via the s3a:// scheme.
minio_bucket = "cleaned-bucket"
csv_path = f"s3a://{minio_bucket}/cv_from_text/transform_cv.csv"

# Load the CSV: the first row supplies column names and column types are
# inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(csv_path)
)

# Print a preview of the loaded rows.
df.show()

+------------+--------------------+--------------------+--------------------+
|    Category|               Skill|             Company|             Project|
+------------+--------------------+--------------------+--------------------+
|Data Science|['javascript', 'j...|   Ernst & Young LLP|[{'Project': 'Cor...|
|Data Science|['python', 'stats...|            Matelabs|                  []|
|Data Science|['analysis', 'exc...|      THEMATHCOMPANY|                  []|
|Data Science|['programming', '...|        Deloitte USI|                  []|
|Data Science|['structure', 'c'...|          Itechpower|                  []|
|Data Science|['office', 'c', '...|                NULL|                  []|
|Data Science|['learning', 'pyt...|Heretic Solutions...|                  []|
|Data Science|['numpy', 'learni...|  Wipro Technologies|                  []|
|Data Science|                  []|Life Insurance Co...|                  []|
|Data Science|['algorithms', 'b...|   IBM India pvt ltd|        

In [9]:
# Destination of the Delta table, under the bucket named by `minio_bucket`.
delta_table = "unified_cv"
table_location = f"s3a://{minio_bucket}/unified/{delta_table}"

# Write `df` to that location in Delta format. The mode is "append", so
# re-running this cell adds the same rows to the table again.
df.write.save(table_location, format="delta", mode="append")