In [None]:
# Batch-read every CSV matching the by-day glob, treating the first row as a
# header and letting Spark infer the column types.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("data/retail-data/by-day/*.csv")
)

# Register the frame as the "retail_data" temp view so it can be queried via spark.sql.
staticDataFrame.createOrReplaceTempView("retail_data")

# Keep the inferred schema; the streaming reader further down is given it
# explicitly via .schema(staticSchema).
staticSchema = staticDataFrame.schema

In [None]:
from pyspark.sql.functions import window, column, desc, col 
# Sum of UnitPrice * Quantity per customer per one-day window of InvoiceDate;
# display the first 5 rows of the result.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)

In [None]:
# SQL against the "retail_data" temp view: total cost grouped by customer and
# by the raw InvoiceDate value (no time window is applied in this query).
x = spark.sql(
    """
    SELECT CustomerId, InvoiceDate, sum(UnitPrice * Quantity) as total_cost
    FROM retail_data
    GROUP BY CustomerId, InvoiceDate
"""
)
x.show(5)

In [None]:
# Streaming reader over the same CSV glob as the static load. The schema is
# supplied explicitly (staticSchema) instead of being inferred, and
# maxFilesPerTrigger is set to 1.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("data/retail-data/by-day/*.csv")
)

In [None]:
# Display the frame's isStreaming flag as the cell output.
streamingDataFrame.isStreaming

In [None]:
# Streaming version of the per-customer aggregation: sum of
# UnitPrice * Quantity per customer per window of InvoiceDate.
# NOTE: despite "PerHour" in the variable name, the window below is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [None]:
# Start the streaming query: "memory" sink, query name "customer_purchases"
# (the name the SQL cell below selects from), output mode "complete".
# The value returned by start() is left as the cell output.
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

In [None]:
# Show 5 rows of customer_purchases, ordered by the summed total_cost,
# largest first.
(
    spark.sql("""
    SELECT * 
    FROM customer_purchases
    ORDER BY `sum(total_cost)` DESC
    """)
    .show(5)
)

### MLlib

In [None]:
# Print the schema of the static frame.
staticDataFrame.printSchema()

In [None]:
from pyspark.sql.functions import date_format, col 
# Prepare the static data for the ML stages:
#   - na.fill(0) substitutes 0 for missing values,
#   - day_of_week is InvoiceDate formatted with the "EEEE" pattern,
#   - coalesce(5) reduces the frame to at most 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Pull the first 5 rows to the driver and display them as the cell output.
preppedDataFrame.take(5)

In [None]:
# Date-based split: rows with InvoiceDate before 2011-07-01 are used for
# training, rows on or after that date for testing.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [None]:
# Row counts of the two splits: training first, then test.
print(trainDataFrame.count())
print(testDataFrame.count())

In [None]:
from pyspark.ml.feature import StringIndexer
# Pipeline stage 1: index the string column day_of_week into day_of_week_index.
indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)

In [None]:
from pyspark.ml.feature import OneHotEncoder 
# Pipeline stage 2: one-hot encode day_of_week_index into day_of_week_encoded.
encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pipeline stage 3: assemble the two numeric columns and the encoded weekday
# into a single "features" vector column.
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [None]:
from pyspark.ml import Pipeline 
# Chain the three feature stages in the order they must run:
# indexer -> encoder -> vectorAssembler.
transformationPipeline = Pipeline(stages=[indexer, encoder, vectorAssembler])

In [None]:
transformedTraining.cache()

In [None]:
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [None]:
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [None]:
from pyspark.ml.clustering import KMeans
# K-means estimator configured with k = 20 and a fixed seed of 1.
kmeans = KMeans(k=20, seed=1)

In [None]:
# Fit the k-means estimator on the transformed training data.
kmModel = kmeans.fit(transformedTraining)

In [None]:
# Cost of the fitted model evaluated on the training data.
# NOTE(review): KMeansModel.computeCost is not available in Spark 3.x
# (deprecated in 2.4); confirm the target Spark version, and consider
# ClusteringEvaluator or kmModel.summary.trainingCost on newer versions.
kmModel.computeCost(transformedTraining)

In [None]:
# Apply the pipeline fitted on the training split to the test split.
transformedTest = fittedPipeline.transform(testDataFrame)

In [None]:
# Cost of the fitted model evaluated on the transformed test data.
# NOTE(review): KMeansModel.computeCost is not available in Spark 3.x
# (deprecated in 2.4); confirm the target Spark version before relying on it.
kmModel.computeCost(transformedTest)

### Lower-Level APIs

In [1]:
from pyspark.sql import Row

# Build an RDD of three single-field Rows through the SparkContext and
# convert it to a DataFrame with toDF().
spark.sparkContext.parallelize([Row(1), Row(2), Row(3)]).toDF()

DataFrame[_1: bigint]