<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_1_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 1
- Defining a sample dataset
- Splitting dataset in many CSVs and uploading them in the input folder in async mode
- Use Spark streaming to read from input folder
- Checking results from query in memory

# Setting up PySpark

In [1]:
%pip install pyspark



In [312]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()
sc = spark.sparkContext

# Reading sample dataset

In [525]:
from pyspark import SparkFiles
from pyspark.sql.types import *
import uuid
from pyspark.sql.functions import udf

def get_data(url: str, filename: str, schema) -> DataFrame:
  # read using SparkFiles
  data = sc.textFile(SparkFiles.get(filename))

  # removing header
  header = data.first()
  data = data.filter(lambda line: line != header)

  df = spark.read.csv(data, header=True, schema=schema)
  return df

@udf
def generate_uuid():
  return str(uuid.uuid4())

schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True)
])

df = get_data(url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv", filename = "squirrel-data.csv", schema = schema)
df = df.withColumn("hash", generate_uuid())

df = df.cache()
df.count()

df.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|                hash|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-02|     

In [459]:
!rm -rf /content/input/
!rm -rf /content/output/
!rm -rf /content/checkpoint/

# Splitting dataset in many CSVs and uploading them in async mode

In [460]:
from pyspark.sql import DataFrame
import time
import asyncio

# Function to split csv into many files (assync)
async def splitDf(df: DataFrame, weight: float, files: int):
  weights = [weight for i in range(files)]
  dfs = df.randomSplit(weights)
  return dfs

# Function to write data as csv (assync)
async def writeFile(dfs: list[DataFrame], path: str, seconds_per_file: int):
  for i in range(len(dfs)):
    df = dfs[i]
    #print(f"Writing file {path}file_{i}.csv with {df.count()} lines")
    df.write.mode("append").format("csv").save(path)
    await asyncio.sleep(seconds_per_file)

async def main(df):
  files = 20
  seconds_per_file = 10
  dfs = await splitDf(df, 1.0, files)
  asyncio.create_task(writeFile(dfs, "/content/input/", seconds_per_file))


# Start producer



In [461]:
await main(df)

# Start streaming (format MEMORY)

In [524]:
from pyspark.sql.streaming import StreamingQuery

def streaming_1() -> StreamingQuery:
  if query.isActive:
    query.stop()

  # Start read of file stream (csv) from input folder
  stream1 = spark.readStream.format('csv').schema(schema).option('header', True).load('/content/input/')

  # Check if dataframe is streaming
  print(stream1.isStreaming)

  # Start write as streaming into memory
  query = (stream1.writeStream
  .format('memory')
  .queryName('my_query')
  .outputMode('append')
  .start()
  )
  return query

def streaming_2() -> StreamingQuery:
  if query.isActive:
    query.stop()

  # Start read of file stream (csv) from input folder
  stream1 = spark.readStream.format('csv').schema(schema).option('header', False).load('/content/input/')

  # Check if dataframe is streaming
  print(stream1.isStreaming)

  # Start write as streaming into memory
  query = (stream1.writeStream
  .format('parquet')
  .option('path', '/content/output')
  .option('checkpointLocation', '/content/checkpoint')
  .trigger(processingTime='5 seconds')
  .outputMode('append')
  .start()
  )
  return query

In [None]:
query = streaming_1()

# Checking results using query in memory

In [521]:
spark.sql("select count(1) from my_query").show()

+--------+
|count(1)|
+--------+
|     412|
+--------+



# Stop writeStreaming

In [32]:
query.stop()

# Start streaming (format PARQUET)

True


# Checking output

In [432]:
df.count()

432

In [497]:
input = spark.read.format("csv").load("/content/input/*")
output = spark.read.format("parquet").schema(schema).load("/content/output/*")

print(f"input - {input.count()}")
print(f"output - {output.count()}")


input - 432
output - 432


In [470]:
query.stop()

In [366]:
# Counting csv files in INPUT folder
!ls -l /content/input | grep .csv | wc -l

20


In [370]:
# Counting csv files in OUTPUT folder
!ls -l /content/output | grep .parquet | wc -l

19


In [188]:
query.stop

In [415]:
query.lastProgress
query.lastProgress['numInputRows']

0

In [416]:
query.recentProgress

[{'id': 'c3f0969a-698e-4cce-8751-d1b85e373cb2',
  'runId': 'b91b4c62-933c-4863-97ff-11a3d76f2ddd',
  'name': None,
  'timestamp': '2024-11-20T21:31:25.001Z',
  'batchId': 19,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 3, 'triggerExecution': 3},
  'stateOperators': [],
  'sources': [{'description': 'FileStreamSource[file:/content/input]',
    'startOffset': {'logOffset': 18},
    'endOffset': {'logOffset': 18},
    'latestOffset': None,
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0}],
  'sink': {'description': 'FileSink[/content/output]', 'numOutputRows': -1}},
 {'id': 'c3f0969a-698e-4cce-8751-d1b85e373cb2',
  'runId': 'b91b4c62-933c-4863-97ff-11a3d76f2ddd',
  'name': None,
  'timestamp': '2024-11-20T21:31:40.000Z',
  'batchId': 19,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'latestOffset': 5, 'triggerExecution': 5},


In [439]:
query.status

{'message': 'Terminated with exception: [UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE] The CSV datasource doesn\'t support the column `durationMs` of the type "STRUCT<addBatch: INT, commitOffsets: INT, getBatch: INT, latestOffset: INT, queryPlanning: INT, triggerExecution: INT, walCommit: INT>".',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [417]:
c = 0
for i in query.recentProgress:
  c += i['numInputRows']

print(c)

0


In [418]:
df.count()

432

In [424]:
input.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)



In [427]:
output.printSchema()

root
 |-- id: string (nullable = true)
 |-- runId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- batchId: string (nullable = true)
 |-- numInputRows: string (nullable = true)
 |-- inputRowsPerSecond: string (nullable = true)
 |-- processedRowsPerSecond: string (nullable = true)
 |-- durationMs: struct (nullable = true)
 |    |-- addBatch: integer (nullable = true)
 |    |-- commitOffsets: integer (nullable = true)
 |    |-- getBatch: integer (nullable = true)
 |    |-- latestOffset: integer (nullable = true)
 |    |-- queryPlanning: integer (nullable = true)
 |    |-- triggerExecution: integer (nullable = true)
 |    |-- walCommit: integer (nullable = true)

