# Week 5
Structured Streaming with local files

In [1]:
import os

import findspark

# Arguments that PySpark forwards to spark-submit when it launches the JVM.
# NOTE(review): the session builder in the next cell also calls
# .master("local[*]"); a master set on the builder presumably takes precedence
# over this flag, so confirm which of the two values is intended.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[2] pyspark-shell"

# Locate the local Spark installation so that `pyspark` can be imported.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# Create the SparkSession for this notebook, or reuse one that already exists.
# The parentheses already allow the chain to span lines, so no backslash
# continuations are needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

In [3]:
# Allow file-based streaming sources to infer their schema instead of
# requiring an explicit one.
# NOTE(review): the stream read below uses the "text" format and only touches
# its `value` column, so this setting is presumably not needed here — confirm.
spark.conf.set("spark.sql.streaming.schemaInference", True)

In [4]:
# Directory (relative to the notebook's working directory) that the streaming
# reader loads text files from.
word_sample = "data/"

In [5]:
# Open the `word_sample` directory as a streaming text source; each input line
# arrives as one row in the `value` column.
lines = (
    spark.readStream
    .format("text")
    .load(word_sample)
)

# Split every line on single spaces and emit one row per resulting token.
words = lines.select(explode(split(lines["value"], " ")).alias("word"))

# Running count of occurrences per distinct word (case-sensitive).
wordCounts = words.groupBy("word").count()

In [6]:
# Show the aggregated DataFrame's repr, i.e. its column names and types.
wordCounts

DataFrame[word: string, count: bigint]

In [7]:
# Start the streaming query; the running counts are stored in the in-memory table "word_counts" rather than printed to the console
# Launch the aggregation as a named streaming query. "complete" output mode
# rewrites the whole result table on each trigger, and the memory sink exposes
# it as the table "word_counts" that spark.sql() can read.
query = (
    wordCounts.writeStream
    .queryName("word_counts")
    .outputMode("complete")
    .format("memory")
    .start()
)

# query.awaitTermination() stays disabled: it would block this cell until the
# query ends, whereas the cells below poll the in-memory table.

In [8]:
# List the streaming queries currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.query.StreamingQuery at 0x23621e9edd0>]

In [17]:
from time import sleep
# Poll the in-memory "word_counts" table five times, one second apart, printing
# at most 15 rows per poll so that updates from newly arrived files show up.
for x in range(5):
    spark.sql("select * from word_counts").show(15)
    sleep(1)  # wait before the next poll

+---------+-----+
|     word|count|
+---------+-----+
|     love|    1|
|streaming|    3|
|       is|    2|
|      but|    1|
|    spark|    6|
|   really|    1|
|     hard|    2|
|     like|    1|
|        I|    2|
|    Spark|    2|
+---------+-----+

+---------+-----+
|     word|count|
+---------+-----+
|     love|    1|
|streaming|    3|
|       is|    2|
|      but|    1|
|    spark|    6|
|   really|    1|
|     hard|    2|
|     like|    1|
|        I|    2|
|    Spark|    2|
+---------+-----+

+---------+-----+
|     word|count|
+---------+-----+
|     love|    1|
|streaming|    3|
|       is|    2|
|      but|    1|
|    spark|    6|
|   really|    1|
|     hard|    2|
|     like|    1|
|        I|    2|
|    Spark|    2|
+---------+-----+

+---------+-----+
|     word|count|
+---------+-----+
|     love|    1|
|streaming|    3|
|       is|    2|
|      but|    1|
|    spark|    6|
|   really|    1|
|     hard|    2|
|     like|    1|
|        I|    2|
|    Spark|    2|
+------