In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# Obtain the Spark session for this notebook (reuses an existing one if present).
spark = SparkSession.builder.appName('appname').getOrCreate()

# Streaming DataFrame fed by the text lines arriving on the localhost:9999 socket.
streamingDF = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 9999)
    .load()
)

# Break each incoming line on single spaces and emit one row per token,
# exposed under the column name `word`.
words = streamingDF.select(
    explode(split(streamingDF['value'], ' ')).alias('word')
)

# Running number of occurrences of every distinct word seen so far.
wordCounts = words.groupBy('word').count()

In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("Streaming Transformation Example") \
    .getOrCreate()

# Step 2: Define the Streaming Source
# Assuming the stream is coming from a directory with continuous input files.
# File-based streaming sources do not infer a schema by default, so the schema
# is declared up front as a DDL string; without it, load() fails with
# "Schema must be specified when creating a streaming source DataFrame".
# NOTE(review): only `someColumn` is referenced in this cell and its INT type
# is an assumption - extend/adjust the schema to match the real input files.
input_schema = "someColumn INT"

input_stream = spark.readStream \
    .format("json") \
    .schema(input_schema) \
    .option("path", "path/to/input/directory") \
    .option("maxFilesPerTrigger", 1) \
    .load()

# Step 3: Define the Transformation
# Example transformation: keep only rows whose someColumn is greater than 10
transformed_stream = input_stream.filter(col("someColumn") > 10)

# Step 4: Output the Transformed Data
# Writing the results to the console; "append" emits only newly arrived rows
query = transformed_stream.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Block this cell until the streaming query stops or fails
query.awaitTermination()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a Spark session
spark = SparkSession.builder \
    .appName("Filter Streaming Data") \
    .getOrCreate()

# Read the streaming data from a socket source on localhost:9999
streamingDF = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Apply a filter transformation.
# The socket source delivers every line as a string in the `value` column, so
# cast it to an integer explicitly before the numeric comparison instead of
# relying on implicit string-to-number coercion.
# NOTE(review): with ANSI mode off, lines that are not valid integers cast to
# NULL and are dropped by the filter - confirm that is the desired handling.
filteredDF = streamingDF.filter(col("value").cast("int") > 100)

# Write the results to the console; "append" emits only newly arrived rows
query = filteredDF.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Block this cell until the streaming query stops or fails
query.awaitTermination()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create a Spark session
spark = SparkSession.builder \
    .appName("Aggregate Streaming Data") \
    .getOrCreate()

# Read the streaming data from a socket source on localhost:9999
streamingDF = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Define aggregation.
# A socket stream exposes its text lines in a single `value` column, so
# grouping directly by "someGroupingColumn" cannot be resolved. Rename `value`
# to the grouping column first; each distinct input line is then one group.
aggregatedDF = streamingDF \
    .withColumnRenamed("value", "someGroupingColumn") \
    .groupBy("someGroupingColumn") \
    .count()

# Write the results to the console; "complete" re-emits the full table of
# running counts on every trigger
query = aggregatedDF.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

# Block this cell until the streaming query stops or fails
query.awaitTermination()


In [None]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Join Stream with Static Data") \
    .getOrCreate()

# Static DataFrame
# NOTE(review): assumed to contain a `key` column to join on - confirm
# against the actual JSON file.
staticDF = spark.read.json("path/to/static/data.json")

# Streaming DataFrame read from a socket source on localhost:9999
streamingDF = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Join operation.
# A socket stream exposes its text lines in a single `value` column, so a join
# on "key" cannot be resolved on the streaming side. Rename `value` to `key`
# so each incoming line is used as the lookup key into the static data.
joinedDF = streamingDF \
    .withColumnRenamed("value", "key") \
    .join(staticDF, "key")

# Write the results to the console; "append" emits only newly joined rows
query = joinedDF.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Block this cell until the streaming query stops or fails
query.awaitTermination()
