**How It Works**  
1. Spark watches a specified directory for **new text files**.  
2. Whenever a new file appears, the lines are read and processed.  
3. This uses Spark Streaming’s `textFileStream()` to ingest the data.

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import time

# File-based streaming word count: watches a directory for new text files,
# counts the words of each 5-second batch, and also keeps a sliding-window
# count over the last 10 seconds.

# 1. Create SparkContext and StreamingContext
sc = SparkContext("local[2]", "FileStreamWordCount")
ssc = StreamingContext(sc, 5)  # batch interval of 5 seconds

# 2. Set checkpoint directory. Checkpointing is required here because the
#    windowed count in step 7 passes an inverse reduce function.
ssc.checkpoint("checkpoint-directory")

# 3. Define the file-based streaming source
#    Replace '/path/to/watch' with an actual directory to watch for new files
lines = ssc.textFileStream("file:///path/to/watch")

# 4. Split each line into words
words = lines.flatMap(lambda line: line.split())

# 5. Simple word count. The (word, 1) pairs are built once and shared by the
#    per-batch count and the windowed count below.
pairs = words.map(lambda w: (w, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# 6. Print and optionally save the results
wordCounts.pprint()
wordCounts.saveAsTextFiles("output/wordcount")

# 7. (Optional) Windowed word count (10s window, slides every 5s)
#    The inverse function subtracts the counts of batches leaving the window.
#    Without filterFunc, words that have left the window would remain in the
#    state with a count of 0 and keep being printed, so drop them explicitly.
windowedWordCounts = pairs.reduceByKeyAndWindow(
    lambda x, y: x + y,
    lambda x, y: x - y,
    10,
    5,
    filterFunc=lambda kv: kv[1] > 0,
)
windowedWordCounts.pprint()

# 8. Start the streaming context
ssc.start()

# 9. Await termination (stop with Ctrl + C in terminal)
try:
    ssc.awaitTermination()
except KeyboardInterrupt:
    # Ctrl + C is the documented way to end this script, so it is not an error.
    pass
finally:
    # Always release the streaming context and the underlying SparkContext,
    # whether we got here via Ctrl + C or a streaming failure.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)