In [2]:
from pyspark.sql import SparkSession

# Make sure you have a running Spark session
spark = SparkSession.builder.appName("ParquetSQLDemo").getOrCreate()

# Create a sample DataFrame
data = [
    ("Alice", "New York", 100),
    ("Bob", "Los Angeles", 250),
    ("Charles", "New York", 75),
    ("David", "Chicago", 150)
]
columns = ["name", "city", "sale_amount"]
df = spark.createDataFrame(data, columns)

# Write the DataFrame to a Parquet file
df.write.parquet("sales.parquet", mode="overwrite")
print("sales.parquet file created.")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/07 11:12:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 4) / 4]

sales.parquet file created.


                                                                                

In [3]:
# Read the Parquet file into a new DataFrame
sales_df = spark.read.parquet("sales.parquet")

# Register the DataFrame as a temporary SQL view
sales_df.createOrReplaceTempView("sales_data")
print("Temporary view 'sales_data' created.")


Temporary view 'sales_data' created.


In [4]:
# Now, use Spark SQL to run queries against the view
# Example 1: Select all data
print("Querying all data:")
spark.sql("SELECT * FROM sales_data").show()


Querying all data:
+-------+-----------+-----------+
|   name|       city|sale_amount|
+-------+-----------+-----------+
|Charles|   New York|         75|
|    Bob|Los Angeles|        250|
|  Alice|   New York|        100|
|  David|    Chicago|        150|
+-------+-----------+-----------+



In [5]:
# Example 2: Filter the data
print("Filtering data for sales over 120:")
spark.sql("SELECT name, sale_amount FROM sales_data WHERE sale_amount > 120").show()

Filtering data for sales over 120:
+-----+-----------+
| name|sale_amount|
+-----+-----------+
|  Bob|        250|
|David|        150|
+-----+-----------+



In [7]:
# Example 3: Perform an aggregation
print("Grouping by city and calculating total sales:")
spark.sql("SELECT city, sum(sale_amount) AS total_sales FROM sales_data GROUP BY city ORDER BY total_sales DESC").show()


Grouping by city and calculating total sales:
+-----------+-----------+
|       city|total_sales|
+-----------+-----------+
|Los Angeles|        250|
|   New York|        175|
|    Chicago|        150|
+-----------+-----------+



In [9]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Create a SparkSession builder
builder = SparkSession.builder \
    .appName("DeltaLakeDemo") \
    .master("local[*]") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Configure the builder with Delta Lake settings from the installed delta-spark pip package
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark session with Delta support created!")

# The rest of your code can follow directly
# Create a sample DataFrame
data = [("Alice", "New York", 100), ("Bob", "Los Angeles", 250), ("Charles", "New York", 75), ("David", "Chicago", 150)]
columns = ["name", "city", "sale_amount"]
df = spark.createDataFrame(data, columns)

# Write the DataFrame to a Parquet file
parquet_path = "sales.parquet"
df.write.parquet(parquet_path, mode="overwrite")
print("sales.parquet file created.")

# Now, convert to Delta Lake
delta_path = "sales_delta"
spark.read.parquet(parquet_path).write.format("delta").mode("overwrite").save(delta_path)

print(f"Parquet file converted to Delta table at {delta_path}")

# Verify that you can read the Delta table
spark.read.format("delta").load(delta_path).show()


25/09/07 11:17:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Spark session with Delta support created!


                                                                                

sales.parquet file created.


Py4JJavaError: An error occurred while calling o102.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.classic.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:626)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:135)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:665)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:665)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:665)
	... 16 more


In [10]:
from delta.tables import DeltaTable

# The path to your existing Parquet file directory
parquet_path = "sales.parquet"
delta_path = "sales_delta"

# Read the Parquet file into a DataFrame
sales_df = spark.read.parquet(parquet_path)

# Write the DataFrame as a Delta table
sales_df.write.format("delta").mode("overwrite").save(delta_path)

print(f"Parquet file converted to Delta table at {delta_path}")

Py4JJavaError: An error occurred while calling o111.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.classic.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:626)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:135)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:665)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:665)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:665)
	... 16 more
