Simple write into Kafka


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import struct

# Create (or reuse) the SparkSession for this batch job
spark = SparkSession.builder \
    .appName("PublishToKafka") \
    .getOrCreate()

# Path to the CSV file
csv_file_path = "path/to/T1.csv"

# Read CSV file into DataFrame (the first row supplies the column names)
df = spark.read.option("header", "true").csv(csv_file_path)

# Serialize every row into one JSON string column named "value";
# the Kafka sink uses the "value" column as the message payload
json_df = df.selectExpr("to_json(struct(*)) AS value")

# Define Kafka parameters
kafka_bootstrap_servers = "localhost:9092"
kafka_topic = "your_topic_name"

# Write data to Kafka.
# This is a batch write: save() returns None once the rows are published,
# so there is no streaming query object and nothing to awaitTermination() on.
json_df.write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("topic", kafka_topic) \
    .save()


Read CSV

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used to load the CSV
session_builder = SparkSession.builder.appName("ReadCSV")
spark = session_builder.getOrCreate()

# Location of the input CSV file
csv_file_path = "path/to/T1.csv"

# Load the file, treating its first row as the header
csv_reader = spark.read.option("header", True)
df = csv_reader.csv(csv_file_path)

# Print the first rows of the DataFrame
df.show()


Publish into Kafka


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct

# Create a SparkSession
spark = SparkSession.builder \
    .appName("PublishToKafka") \
    .getOrCreate()

# Path to the CSV file
csv_file_path = "path/to/T1.csv"

# Read the CSV file into a (batch) DataFrame
df = spark.read.csv(csv_file_path, header=True)

# Convert each row to a JSON string in a column named "value",
# which the Kafka sink uses as the message payload
json_df = df.select(to_json(struct("*")).alias("value"))

# Define Kafka parameters
kafka_bootstrap_servers = "localhost:9092"
kafka_topic = "your_topic_name"

# Write the JSON data to Kafka.
# `df` was loaded with spark.read (batch), so the batch writer must be used:
# writeStream is only valid on a streaming DataFrame and raises an
# AnalysisException here. save() returns None when the write completes,
# so there is no query to await.
json_df.write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("topic", kafka_topic) \
    .save()


Read from Kafka back into Spark

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for the Kafka consumer
spark = SparkSession.builder.appName("ReadFromKafka").getOrCreate()

# Kafka connection settings
kafka_bootstrap_servers = "localhost:9092"
kafka_topic = "your_topic_name"

# Subscribe to the topic as a streaming source
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic)
    .load()
)

# Echo every newly arrived record to the console
console_writer = df.writeStream.outputMode("append").format("console")
query = console_writer.start()

# Block until the streaming query stops
query.awaitTermination()


In [None]:
# Report whether `df` is a streaming DataFrame
streaming_status = (
    "DataFrame is streaming. Monitoring for new data..."
    if df.isStreaming
    else "DataFrame is not streaming. No new data to monitor."
)
print(streaming_status)


Write to Delta table

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_date, current_date, current_timestamp, struct

# Create a SparkSession
spark = SparkSession.builder \
    .appName("WriteToDelta") \
    .getOrCreate()

# Define Kafka parameters
kafka_bootstrap_servers = "localhost:9092"
kafka_topic = "your_topic_name"

# Read data from Kafka into a streaming DataFrame
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", kafka_topic) \
    .load()

# Cast the Kafka `value` column to a string, parse it as JSON against the
# expected schema, then flatten the parsed struct into top-level columns
df = df.selectExpr("CAST(value AS STRING) as value") \
    .select(from_json("value", "signal_date DATE, signal_tc TIMESTAMP, signals MAP<STRING, STRING>").alias("data")) \
    .select("data.signal_date", "data.signal_tc", "data.signals")

# Add create_date and create_ts columns holding the current date/timestamp
df = df.withColumn("create_date", current_date()) \
    .withColumn("create_ts", current_timestamp())

# Write the DataFrame to the Delta table.
# NOTE: `df` comes from readStream, so the streaming writer (writeStream) is
# required here; the batch `write` API only applies to a non-streaming
# DataFrame. This note must stay above the statement: a comment after a
# line-continuation backslash is a syntax error.
delta_table_path = "path/to/delta_table"
query = df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "path/to/checkpoint/dir") \
    .start(delta_table_path)

# Wait for the query to terminate
query.awaitTermination()


In [None]:
# Use this session setup if Delta tables cannot be written or read because
# the Delta Lake package is not available on the cluster: it pulls in the
# required package and enables the Delta SQL extension and catalog.

from pyspark.sql import SparkSession

# Create SparkSession with the Delta Lake package and its session settings.
# NOTE: getOrCreate() returns an already-running session if one exists, so
# these settings presumably only take effect when this cell creates the
# session (e.g. after a kernel restart) - confirm in your environment.
spark = SparkSession.builder \
    .appName("YourAppName") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()


Read Delta Lake table

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used to query the Delta table
spark = SparkSession.builder.appName("ReadDeltaLake").getOrCreate()

# Location of the Delta table
delta_table_path = "path/to/delta_table"

# Load the Delta table into a DataFrame
delta_reader = spark.read.format("delta")
df = delta_reader.load(delta_table_path)

# Print the first five rows
df.show(5)


distinct datapoints

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, countDistinct

# Obtain the SparkSession for the distinct-count report
spark = SparkSession.builder.appName("DistinctSignalTcPerDay").getOrCreate()

# Location of the Delta table
delta_table_path = "path/to/delta_table"

# Load the Delta table
df = spark.read.format("delta").load(delta_table_path)

# Derive the calendar date and the hour of day from 'signal_tc'
signal_tc = col("signal_tc")
df = (
    df.withColumn("date", date_format(signal_tc, "yyyy-MM-dd"))
    .withColumn("hour", date_format(signal_tc, "H"))
)

# Count the distinct 'signal_tc' values within each (date, hour) bucket
hourly_buckets = df.groupBy("date", "hour")
distinct_signal_tc_per_hour = hourly_buckets.agg(
    countDistinct("signal_tc").alias("distinct_signal_tc_count")
)

# Roll the hourly counts up into one total per day
daily_buckets = distinct_signal_tc_per_hour.groupBy("date")
distinct_signal_tc_per_day = daily_buckets.sum("distinct_signal_tc_count")

# Print the per-day totals
distinct_signal_tc_per_day.show()


avg per hour

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, avg

# Obtain the SparkSession for the hourly-average report
spark = SparkSession.builder.appName("AverageSignalPerHour").getOrCreate()

# Location of the Delta table
delta_table_path = "path/to/delta_table"

# Load the Delta table
df = spark.read.format("delta").load(delta_table_path)

# Tag each row with the hour of day taken from 'signal_tc'
df = df.withColumn("hour", hour("signal_tc"))

# (source column, output alias) for every signal that gets averaged.
# NOTE(review): the "write to delta table" cell in this file stores signals
# in a `signals` map column - confirm the table really exposes these flat
# columns before relying on this cell.
signal_aliases = [
    ("LV ActivePower", "avg_LV_ActivePower"),
    ("Wind Speed", "avg_Wind_Speed"),
    ("Theo_Power_Curve", "avg_Theo_Power_Curve"),
    ("Wind Direction", "avg_Wind_Direction"),
]

# One average per signal for every hour of the day
hourly_averages = [avg(source).alias(alias) for source, alias in signal_aliases]
average_per_hour = df.groupBy("hour").agg(*hourly_averages)

# Print the hourly averages
average_per_hour.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, hour

# Create SparkSession
spark = SparkSession.builder \
    .appName("AverageSignalPerHour") \
    .getOrCreate()

# Path to the Delta table
delta_table_path = "path/to/delta_table"

# Read Delta table as DataFrame
df = spark.read.format("delta").load(delta_table_path)

# Extract hour from 'signal_tc'
df = df.withColumn("hour", hour("signal_tc"))

# Per-row mean of the four signals.
# NOTE(review): the "write to delta table" cell in this file stores signals
# in a `signals` map column - confirm the table really exposes these flat
# columns before relying on this cell.
combined_signal = (
    col("LV ActivePower") + col("Wind Speed") + col("Theo_Power_Curve") + col("Wind Direction")
) / 4.0

# Group by hour and calculate the average of all signals combined.
# agg() only accepts aggregate expressions, so the per-row mean is wrapped in
# avg(); the alias is applied to that column (not to the resulting DataFrame)
# so the output column is actually named "average_all_signals_per_hour".
average_all_signals_per_hour = df.groupBy("hour").agg(
    avg(combined_signal).alias("average_all_signals_per_hour")
)

# Show the results
average_all_signals_per_hour.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Obtain the SparkSession for the labelling job
spark = SparkSession.builder.appName("AddGenerationIndicator").getOrCreate()

# Location of the Delta table
delta_table_path = "path/to/delta_table"

# Load the Delta table
df = spark.read.format("delta").load(delta_table_path)

# Bucket 'LV ActivePower' into a label. The when() branches are evaluated in
# order, so each branch only needs its upper bound; rows matching no branch
# fall through to "Exceptional".
active_power = col("LV ActivePower")
generation_indicator = (
    when(active_power < 200, "Low")
    .when(active_power < 600, "Medium")
    .when(active_power < 1000, "High")
    .otherwise("Exceptional")
)
df = df.withColumn("generation_indicator", generation_indicator)

# Print the labelled rows
df.show()
