<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/1-read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 2
- Reading data from "rate"
- Aggregating data by window time
- Checking results from query in memory

# Setting up PySpark

In [None]:
%pip install pyspark



In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').config('spark.ui.port', '4050').getOrCreate()

In [None]:
import pyspark.sql.functions as F

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count()

# write stream
query = (agg.writeStream
.format('memory')
.queryName('my_query')
.outputMode('complete')
.start()
)

In [None]:
spark.sql("select * from my_query order by window desc").show(10,False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2024-11-05 18:12:25, 2024-11-05 18:12:30}|29   |
|{2024-11-05 18:12:20, 2024-11-05 18:12:25}|50   |
|{2024-11-05 18:12:15, 2024-11-05 18:12:20}|50   |
|{2024-11-05 18:12:10, 2024-11-05 18:12:15}|50   |
|{2024-11-05 18:12:05, 2024-11-05 18:12:10}|50   |
|{2024-11-05 18:12:00, 2024-11-05 18:12:05}|50   |
|{2024-11-05 18:11:55, 2024-11-05 18:12:00}|50   |
|{2024-11-05 18:11:50, 2024-11-05 18:11:55}|50   |
|{2024-11-05 18:11:45, 2024-11-05 18:11:50}|50   |
|{2024-11-05 18:11:40, 2024-11-05 18:11:45}|50   |
+------------------------------------------+-----+
only showing top 10 rows



In [None]:
query.stop()

In [None]:

# Tasks:
## Create/extract column "minute" from the timestamp
## Save output as JSON without partitioning / or partitioned by minute

# Questions:
##
##



In [54]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [68]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def enrich_df(df: DataFrame) -> DataFrame:
  fake = Faker()
  new_columns = {
      'name': F.lit(fake.name()),
      'address': F.lit(fake.address()),
      'email': F.lit(fake.email()),
      'dob': F.lit(fake.date_of_birth()),
      'phone': F.lit(fake.phone_number())
  }
  df = df.withColumns(new_columns)
  return df

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# write stream
query = (df_stream.writeStream
.format('memory')
.queryName('enriched')
.outputMode('append')
.start()
)

AttributeError: 'DataStreamReader' object has no attribute 'transform'

In [52]:
query.stop()

In [66]:
spark.sql("select * from enriched").show()

+--------------------+-----+----------+--------------------+--------------------+----------+------------+
|           timestamp|value|      name|             address|               email|       dob|       phone|
+--------------------+-----+----------+--------------------+--------------------+----------+------------+
|2024-11-19 20:32:...|    0|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    1|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    2|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    3|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    4|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    5|John Silva|21383 Davis Green...|isabelsalas@examp...|1942-01-04|775.239.3916|
|2024-11-19 20:32:...|    6|John Silva|21383 D

In [67]:
query.stop()