<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/examples/example_1_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 1
- Defining a sample dataset
- Splitting dataset in many CSVs and uploading them in the input folder in async mode
- Use Spark streaming to read from input folder
- Checking results from query in memory

# Setting up PySpark

In [1]:
%pip install pyspark



In [312]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()
sc = spark.sparkContext

# Reading sample dataset

In [339]:
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"

from pyspark.sql.types import *
schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True)
])

data = sc.textFile(SparkFiles.get("squirrel-data.csv"))

header = data.first()
data = data.filter(lambda line: line != header)

# spark.sparkContext.addFile(url)
# df = spark.read.csv(SparkFiles.get("squirrel-data.csv"), header=True, schema=schema)

df = spark.read.csv(data, header=True, schema=schema)

df.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-02|             Gray|                  White|       NULL|Ground Plane| 

In [265]:
import uuid
from pyspark.sql.functions import udf

@udf
def generate_uuid():
  return str(uuid.uuid4())

df = df.withColumn("hash", generate_uuid())

df = df.cache()
df.count()

433

In [266]:
!rm -rf /content/input/
!rm -rf /content/output/
!rm -rf /content/checkpoint/

# Splitting dataset in many CSVs and uploading them in async mode

In [267]:
from pyspark.sql import DataFrame
import time
import asyncio

# Function to split csv into many files (assync)
async def splitDf(df: DataFrame, weight: float, files: int):
  weights = [weight for i in range(files)]
  dfs = df.randomSplit(weights)
  return dfs

# Function to write data as csv (assync)
async def writeFile(dfs: list[DataFrame], path: str, seconds_per_file: int):
  for i in range(len(dfs)):
    df = dfs[i]
    #print(f"Writing file {path}file_{i}.csv with {df.count()} lines")
    df.write.mode("append").format("csv").save(path)
    await asyncio.sleep(seconds_per_file)

async def main(df):
  files = 20
  seconds_per_file = 10
  dfs = await splitDf(df, 1.0, files)
  asyncio.create_task(writeFile(dfs, "/content/input/", seconds_per_file))
  #await writeFile(dfs, "/content/input/", seconds_per_file)


In [268]:
await main(df)

# Read CSVs as streaming

In [None]:
# delete input folder
#! rm -rf /content/input
#! rm -rf /content/checkpoint

In [19]:
# Start read of file stream (csv) from input folder
stream1 = spark.readStream.format('csv').schema(schema).option('header', True).load('/content/input/')

# Check if dataframe is streaming
print(stream1.isStreaming)

# Start write as streaming into memory
query = (stream1.writeStream
.format('memory')
.queryName('my_query')
.outputMode('append')
.start()
)

True


# Checking results using query in memory

In [52]:
spark.sql("select count(1) from my_query").show()

+--------+
|count(1)|
+--------+
|     175|
+--------+



# Stop writeStreaming

In [32]:
query.stop()

In [269]:
# Start read of file stream (csv) from input folder
stream1 = spark.readStream.format('csv').schema(schema).option('header', True).load('/content/input/')

# Check if dataframe is streaming
print(stream1.isStreaming)

# Start write as streaming into memory
query = (stream1.writeStream
.format('parquet')
.option('path', '/content/output')
.option('checkpointLocation', '/content/checkpoint')
.trigger(processingTime='5 seconds')
.outputMode('append')
.start()
)

True


In [287]:
input = spark.read.format("csv").load("/content/input/*")
output = spark.read.format("parquet").load("/content/output/*")

print(f"input - {input.count()}")
print(f"output - {output.count()}")


input - 433
output - 413


In [288]:
!ls -l /content/input | grep .csv | wc -l

20


In [188]:
query.stop

In [289]:
query.lastProgress

{'id': '4518de10-4e7b-44ff-b537-5252a262662d',
 'runId': '9bc35863-be8b-4e5e-b5b0-0200efbc29b3',
 'name': None,
 'timestamp': '2024-11-20T20:06:45.000Z',
 'batchId': 19,
 'numInputRows': 26,
 'inputRowsPerSecond': 5.2,
 'processedRowsPerSecond': 104.0,
 'durationMs': {'addBatch': 145,
  'commitOffsets': 34,
  'getBatch': 6,
  'latestOffset': 37,
  'queryPlanning': 4,
  'triggerExecution': 250,
  'walCommit': 23},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/content/input]',
   'startOffset': {'logOffset': 18},
   'endOffset': {'logOffset': 19},
   'latestOffset': None,
   'numInputRows': 26,
   'inputRowsPerSecond': 5.2,
   'processedRowsPerSecond': 104.0}],
 'sink': {'description': 'FileSink[/content/output]', 'numOutputRows': -1}}

In [244]:
!ps

    PID TTY          TIME CMD
      1 ?        00:00:00 docker-init
      7 ?        00:00:14 node
     17 ?        00:00:01 oom_monitor.sh
     19 ?        00:00:00 run.sh
     21 ?        00:00:02 kernel_manager_
     39 ?        00:00:00 tail
     47 ?        00:00:00 tail
     68 ?        00:00:15 python3 <defunct>
     69 ?        00:00:02 colab-fileshim.
     86 ?        00:00:10 jupyter-noteboo
     87 ?        00:00:03 dap_multiplexer
   2451 ?        00:00:21 python3
   2472 ?        00:00:06 python3
   2561 ?        00:05:46 java
   2621 ?        00:00:02 language_servic
   2627 ?        00:01:29 node
  21416 ?        00:00:00 sleep
  21417 ?        00:00:00 ps


In [290]:
query.status

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [294]:
query.

[{'id': '4518de10-4e7b-44ff-b537-5252a262662d',
  'runId': '9bc35863-be8b-4e5e-b5b0-0200efbc29b3',
  'name': None,
  'timestamp': '2024-11-20T20:03:31.663Z',
  'batchId': 0,
  'numInputRows': 27,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 93.10344827586208,
  'durationMs': {'addBatch': 155,
   'commitOffsets': 59,
   'getBatch': 12,
   'latestOffset': 28,
   'queryPlanning': 8,
   'triggerExecution': 290,
   'walCommit': 25},
  'stateOperators': [],
  'sources': [{'description': 'FileStreamSource[file:/content/input]',
    'startOffset': None,
    'endOffset': {'logOffset': 0},
    'latestOffset': None,
    'numInputRows': 27,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 93.10344827586208}],
  'sink': {'description': 'FileSink[/content/output]', 'numOutputRows': -1}},
 {'id': '4518de10-4e7b-44ff-b537-5252a262662d',
  'runId': '9bc35863-be8b-4e5e-b5b0-0200efbc29b3',
  'name': None,
  'timestamp': '2024-11-20T20:03:40.000Z',
  'batchId': 1,
  'numInputRows':

In [297]:
c = 0
for i in query.recentProgress:
  c += i['numInputRows']

print(c)

413


In [253]:
df.count()

433

In [300]:
input.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)



In [298]:
output.join(input, output.hash == input.hash, 'leftanti').show()

AttributeError: 'DataFrame' object has no attribute 'hash'

In [255]:
output.count()

413

In [256]:
df.show()


+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-01|             Gray|                  White|       NULL|Ground Plane| 