In [0]:
# Task 1: Create a Databricks notebook and perform a basic operation.
# Computes the sum and the arithmetic mean of a small list of integers
# and prints both on one line.
numbers = [5, 10, 15, 20]
count = len(numbers)
total = sum(numbers)
average = total / count  # true division, so the mean is a float
print(f"Total: {total}, Average: {average}")


Total: 50, Average: 12.5


In [0]:
# Task 2: Set up the Azure Databricks workspace and configure clusters.
# This cell performs no configuration itself; it only prints a
# confirmation message.
cluster_status_message = "Cluster is successfully configured!!!"
print(cluster_status_message)

Cluster is successfully configured!!!


In [0]:
# Copy the streaming sample CSV from the shared workspace folder to
# dbfs:/FileStore. The call is kept as the last expression so the cell
# still displays its return value.
streaming_csv_source = "file:/Workspace/Shared/real_time_streaming_data.csv"
streaming_csv_target = "dbfs:/FileStore/real_time_streaming_data.csv"
dbutils.fs.cp(streaming_csv_source, streaming_csv_target)

True

In [0]:
# Copy the sales CSV from the shared workspace folder to dbfs:/FileStore.
# The call is kept as the last expression so the cell still displays its
# return value.
sales_csv_source = "file:/Workspace/Shared/exe_sales_data.csv"
sales_csv_target = "dbfs:/FileStore/exe_sales_data.csv"
dbutils.fs.cp(sales_csv_source, sales_csv_target)

True

In [0]:
# Copy the transaction CSV from the shared workspace folder to
# dbfs:/FileStore. The call is kept as the last expression so the cell
# still displays its return value.
transaction_csv_source = "file:/Workspace/Shared/exe_transaction_data.csv"
transaction_csv_target = "dbfs:/FileStore/exe_transaction_data.csv"
dbutils.fs.cp(transaction_csv_source, transaction_csv_target)

True

In [0]:
# Task 3: Real-time data processing with Databricks and real-time aggregation.
# DDL schema applied to the streaming CSV source.
schema = "event_time TIMESTAMP, event_type STRING, user_id STRING, amount DOUBLE"

# Read only the streaming sample file. Loading the whole dbfs:/FileStore/
# directory would also ingest the sales and transaction CSVs copied there by
# earlier cells (and any Delta/Parquet/JSON output written under it), none of
# which match `schema`.
streamind_data = (
    spark.readStream.format("csv")
    .schema(schema)
    .option("header", "true")
    .load("dbfs:/FileStore/real_time_streaming_data.csv")
)

# Running total of `amount` per event type; the default aggregate column
# name "sum(amount)" is renamed to "total_amount".
aggregated_data = (
    streamind_data.groupBy("event_type")
    .agg({"amount": "sum"})
    .withColumnRenamed("sum(amount)", "total_amount")
)

# "complete" output mode re-emits the full aggregate table on every trigger.
# `query` is the handle of the started stream.
query = (aggregated_data.writeStream
         .outputMode("complete")
         .format("console")
         .start())

In [0]:
# Task 4: Data exploration and visualization in Databricks (EDA).
from pyspark.sql.functions import col

# Load the sales CSV, using the first row as the header and letting Spark
# infer column types. `df` is reused by the cells that follow.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/exe_sales_data.csv")
)

# Total amount per category.
sales_by_category = df.groupBy("category").agg({"amount": "sum"})
sales_by_category.display()

# The amount and quantity columns on their own.
amount_and_quantity = df.select("amount", "quantity")
amount_and_quantity.display()



In [0]:
import matplotlib.pyplot as plt

# Sum Price per Product in Spark and collect the result as a Pandas frame.
df_pandas_category = df.groupBy("Product").sum("Price").toPandas()

# Bar chart of the aggregate; Spark names the summed column "sum(Price)".
ax = df_pandas_category.plot.bar(x="Product", y="sum(Price)", legend=False)

# Title, axis labels, and slanted tick labels.
ax.set_title("Total Sales by Product")
ax.set_xlabel("Product")
ax.set_ylabel("Total Sales (Price)")
plt.xticks(rotation=45, ha="right")
plt.show()


In [0]:
# Collect only the two columns needed for the scatter plot into Pandas.
df_pandas_scatter = df.select("Quantity", "Price").toPandas()

# Scatter Quantity against Price on the current axes, semi-transparent.
scatter_ax = plt.gca()
scatter_ax.scatter(
    df_pandas_scatter["Quantity"],
    df_pandas_scatter["Price"],
    alpha=0.5,
)

# Title, axis labels and grid.
scatter_ax.set_title("Quantity vs Price")
scatter_ax.set_xlabel("Quantity")
scatter_ax.set_ylabel("Price")
scatter_ax.grid(True)
plt.show()


In [0]:
# Sum Price per Product in Spark and collect the result as a Pandas frame.
# NOTE(review): this repeats the earlier per-product bar chart; the variable
# name hints at a per-customer view - confirm which grouping was intended.
price_by_product = df.groupBy("Product").sum("Price")
df_pandas_customer = price_by_product.toPandas()

# Bar chart of the aggregate; Spark names the summed column "sum(Price)".
bar_ax = df_pandas_customer.plot.bar(x="Product", y="sum(Price)", legend=False)

# Title, axis labels, and slanted tick labels.
bar_ax.set_title("Total Sales by Product")
bar_ax.set_xlabel("Product")
bar_ax.set_ylabel("Total Sales (Price)")
plt.xticks(rotation=45, ha="right")
plt.show()


In [0]:
# Per-product totals of Quantity and Price, collected as a Pandas frame.
# Spark names the aggregate columns "sum(Quantity)" and "sum(Price)".
totals_by_product = df.groupBy("Product").agg({"Quantity": "sum", "Price": "sum"})
df_pandas_quantity_vs_sales = totals_by_product.toPandas()

# One point per product: total quantity against total price.
totals_ax = plt.gca()
totals_ax.scatter(
    df_pandas_quantity_vs_sales["sum(Quantity)"],
    df_pandas_quantity_vs_sales["sum(Price)"],
)

# Title, axis labels and grid.
totals_ax.set_title("Total Quantity vs Total Sales by Product")
totals_ax.set_xlabel("Total Quantity")
totals_ax.set_ylabel("Total Sales (Price)")
totals_ax.grid(True)
plt.show()


In [0]:
# Task 5: Read and write data in Databricks in different formats.
# Read the sales CSV once (header row, inferred types), then write the same
# DataFrame as Delta, Parquet and JSON, overwriting any previous output.
df_csv = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("dbfs:/FileStore/exe_sales_data.csv")
)

# Delta
df_csv.write.mode("overwrite").format("delta").save("dbfs:/FileStore/delta_table")
print("Writing to the delta table completed")

# Parquet
df_csv.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/parquet_table")
print("Writing to the parquet file completed")

# JSON
df_csv.write.format("json").mode("overwrite").save("dbfs:/FileStore/json_table")
print("Writing to the json file completed")

In [0]:
# Task 6: Analyzing and visualizing streaming data with Databricks.
# `schema` is the DDL string defined in the Task 3 cell; run that cell first.
# header=True matches how Task 3 reads this same file; without it the CSV
# header line is parsed as a data row and shows up as a null event_time group.
streaming_data = spark.readStream.csv(
    "dbfs:/FileStore/real_time_streaming_data.csv",
    schema=schema,
    header=True,
)

# Running sum of `amount` for each distinct event_time value.
aggregated_stream = streaming_data.groupBy("event_time").sum("amount")
display(aggregated_stream)


In [0]:
# Task 7: Introduction to Databricks Delta Lake (create and update a Delta table).
# NOTE(review): `df` is the sales DataFrame loaded in the Task 4 cell, while
# exe_transaction_data.csv copied earlier is never read - confirm which
# dataset belongs in this "transactions" table.
transactions_path = "/delta/transactions"

# (Re)create the Delta table from `df`.
df.write.mode("overwrite").format("delta").save(transactions_path)

# Record the version produced by the write above. A hard-coded version number
# is wrong here: on a fresh path the write is version 0 and the UPDATE is
# version 1, and on re-runs the numbers keep growing. DESCRIBE HISTORY lists
# the newest commit first, so LIMIT 1 yields the current version.
version_before_update = spark.sql(
    f"DESCRIBE HISTORY delta.`{transactions_path}` LIMIT 1"
).first()["version"]

# Increase `amount` by 10% for one customer; this commits a new table version.
spark.sql(
    f"UPDATE delta.`{transactions_path}` "
    "SET amount = amount * 1.1 WHERE customer_id = 9363"
)

# Time travel: read the table as it was immediately before the UPDATE.
previous_version = (
    spark.read.format("delta")
    .option("versionAsOf", version_before_update)
    .load(transactions_path)
)
previous_version.show()


In [0]:
# Task 8: Delta table history, VACUUM and time travel.
# Every statement below targets the path-based Delta table written here. The
# notebook never creates a table named `sales_data`, so referring to that name
# fails on a clean run.
delta_table_path = "dbfs:/FileStore/delta_table"
df_csv.write.format("delta").mode("overwrite").save(delta_table_path)

# Commit log of the table, newest version first.
history_df = spark.sql(f"DESCRIBE HISTORY delta.`{delta_table_path}`")
history_df.show(truncate=False)

# Remove data files no longer referenced by the table and older than
# 168 hours (7 days).
spark.sql(f"VACUUM delta.`{delta_table_path}` RETAIN 168 HOURS")

# Time travel to version 1. The Task 5 cell already wrote to this same path,
# so after the write above at least versions 0 and 1 should exist
# (assumes Task 5 ran first, which it must have for `df_csv` to be defined).
historical_version_df = (
    spark.read.format("delta")
    .option("versionAsOf", 1)
    .load(delta_table_path)
)
historical_version_df.show(truncate=False)

In [0]:
# Task 9: Managed and unmanaged tables.
# mode("overwrite") keeps the cell re-runnable; with the default save mode a
# second run fails because the tables already exist.

# Managed table: no path is given, so the metastore decides where data lives.
df.write.mode("overwrite").saveAsTable("managed_table")

# Unmanaged (external) table: the storage location is supplied through the
# "path" option. The previous code passed a file path as the option *key*,
# which set no location and therefore created another managed table.
# NOTE(review): if `unmanaged_sales` already exists as a managed table from an
# earlier run, drop it once before running this cell.
df.write.mode("overwrite").option("path", "/mnt/sales_data").saveAsTable("unmanaged_sales")

spark.sql("SELECT * FROM managed_table").show()

In [0]:
# Task 10: Create views and temporary views.
# Register `df` both as a temporary view and as a global temporary view.
df.createOrReplaceTempView("temp_view")
df.createOrReplaceGlobalTempView("global_temp_view")

# Query each view and print the result; the global view is addressed through
# the `global_temp` database.
for view_query in (
    "SELECT * FROM temp_view",
    "SELECT * FROM global_temp.global_temp_view",
):
    spark.sql(view_query).show()
