Structured Streaming

In [None]:
from pyspark.shell import spark

# Read every per-day retail CSV into one static (batch) DataFrame, taking
# column names from the header row and letting Spark infer the column types.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/datasets/retail-data/by-day/*.csv")
)

# Keep the inferred schema for later reuse and register the data as the
# temporary view "retail_data".
staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

# Optional inspection (left disabled):
#display(staticDataFrame)
#display(staticSchema)

In [None]:
from pyspark.sql.functions import window, column, desc, col

# Batch aggregation: total spend per customer inside each one-day window of
# InvoiceDate, showing the first five result rows.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)

In [None]:
# Set the number of Spark SQL shuffle partitions to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Build a streaming reader over the same CSV files, reusing the schema
# captured from the static read; maxFilesPerTrigger is set to 1.
streamingDataFrame = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .load("/datasets/retail-data/by-day/*.csv")
)

# Cell output: whether the DataFrame is a streaming one.
streamingDataFrame.isStreaming

In [21]:
# Streaming aggregation: sum of total_cost per customer within each one-day
# window of InvoiceDate.
# NOTE: the variable name says "PerHour", but the window below is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [None]:
# Launch the streaming query: write to the "memory" sink under the query name
# "customer_purchases", using "complete" output mode.
(
    purchaseByCustomerPerHour.writeStream
    .queryName("customer_purchases")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [None]:
# Stream started so run query against the "customer_purchases" table.
# The aggregate column is literally named `sum(total_cost)`, so it has to be
# quoted with backticks to be treated as a column identifier. Single quotes
# would turn it into a string literal, i.e. a constant sort key, and the rows
# would come back in no particular order.
spark.sql("""
    SELECT *
    FROM customer_purchases
    ORDER BY `sum(total_cost)` DESC
""") \
    .show(5)

Machine Learning with K-Means

In [None]:
# Analyzing the DataFrame structure: print the schema of the static DataFrame
# so the available columns can be checked before feature preparation.
staticDataFrame.printSchema()

In [12]:
from pyspark.sql.functions import date_format, col

# Prepare the data for feature engineering:
#   1. fill missing values with 0,
#   2. add "day_of_week", formatted from InvoiceDate with the "EEEE" pattern,
#   3. coalesce down to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Date-based split: rows with InvoiceDate before 2011-02-01 go to the training
# set, rows on or after that date go to the test set.
trainDataFrame = preppedDataFrame.filter("InvoiceDate < '2011-02-01'")
testDataFrame = preppedDataFrame.filter("InvoiceDate >= '2011-02-01'")

# Optional size checks (left disabled):
#trainDataFrame.count()
#testDataFrame.count()

In [None]:
from pyspark.ml.feature import StringIndexer

# Stage 1: StringIndexer reading "day_of_week" and writing "day_of_week_index".
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [None]:
from pyspark.ml.feature import OneHotEncoder

# Stage 2: OneHotEncoder reading "day_of_week_index" and writing
# "day_of_week_encoded".
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Stage 3: assemble UnitPrice, Quantity and the encoded day of week into a
# single "features" column.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [None]:
from pyspark.ml import Pipeline

# Chain the three feature stages (index -> one-hot encode -> assemble) into a
# single pipeline.
transformatPipeline = Pipeline(stages=[indexer, encoder, vectorAssembler])

# Fit the pipeline on the training split, then apply the fitted stages to
# that same split.
fittedPipeline = transformatPipeline.fit(trainDataFrame)
transformedTraining = fittedPipeline.transform(trainDataFrame)

# Mark the transformed training data for caching.
transformedTraining.cache()

In [None]:
from pyspark.ml.clustering import KMeans

# Training the model: K-Means with k=20 and a fixed seed.
# The seed is a plain int literal: the Python 2 long suffix (`1L`) is a
# SyntaxError on Python 3, while `1` is accepted by both.
kmeans = KMeans().setK(20).setSeed(1)

kmModel = kmeans.fit(transformedTraining)

# Cost of the fitted model on the training data (cell output).
# NOTE(review): computeCost is deprecated/removed in newer Spark releases —
# confirm it exists on the Spark version this notebook targets.
kmModel.computeCost(transformedTraining)

In [None]:
# Testing the model: apply the pipeline that was fitted on the training split
# to the test split, so both go through the same fitted stages.
transformedTest = fittedPipeline.transform(testDataFrame)

# Cost of the trained model on the transformed test data (cell output).
# NOTE(review): computeCost is deprecated/removed in newer Spark releases —
# confirm it exists on the Spark version this notebook targets.
kmModel.computeCost(transformedTest)

End