In [1]:
# findspark has to be initialised first: the pyspark imports below rely on it.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Attach to the active session if there is one, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Smoke test: a one-row SQL query shows the session can actually execute work.
df = spark.sql(
    '''select 'spark' as hello '''
)
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [2]:
# Batch-read every per-day retail CSV. Each file's first line is treated as a
# header and column types are inferred from the data.
# NOTE(review): the path is absolute and machine-specific; the same root is
# assigned to `data_path` in a later cell.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("file:///home/sha/dev/books/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")
)

In [3]:
# Keep the inferred schema so the streaming reader can be given it explicitly.
staticSchema = staticDataFrame.schema

# Expose the batch data to Spark SQL under the name "retail_data".
staticDataFrame.createOrReplaceTempView("retail_data")

In [4]:
from pyspark.sql.functions import window, column, desc, col
# Total spend (UnitPrice * Quantity) per customer per one-day window of
# InvoiceDate, computed on the batch data; only the first 5 rows are printed.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)

+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|[2011-12-05 08:00...|            -37.6|
|   14126.0|[2011-11-29 08:00...|643.6300000000001|
|   13500.0|[2011-11-16 08:00...|497.9700000000001|
|   17160.0|[2011-11-08 08:00...|516.8499999999999|
|   15608.0|[2011-11-11 08:00...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



In [5]:
# Use only 5 shuffle partitions from here on; this affects queries run after
# this cell (the aggregation shown above ran with the previous setting).
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [6]:
# Root directory of the book's sample data, used to build the streaming input path.
# NOTE(review): absolute, machine-specific location; adjust for your environment.
data_path = 'file:///home/sha/dev/books/Spark-The-Definitive-Guide/data'

In [7]:
# Streaming counterpart of the batch read: same CSV files, but the schema is
# supplied up front (reusing the one inferred from the batch read) and
# maxFilesPerTrigger=1 limits each trigger to a single new file.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load(data_path + "/retail-data/by-day/*.csv")
)

In [8]:
# Same per-customer aggregation as the batch query, defined on the stream.
# This only builds the query plan; no sink is attached in this cell.
# NOTE(review): despite the "PerHour" name, the window duration is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [9]:
# Show the column names and inferred types of the batch data.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [10]:
from pyspark.sql.functions import date_format, col

In [11]:
# Prepare the batch data for ML: fill nulls with 0, derive a 'day_of_week'
# string column from InvoiceDate (pattern 'EEEE'), and reduce the result to
# at most 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn('day_of_week', date_format(col('InvoiceDate'), 'EEEE'))
    .coalesce(5)
)

In [12]:
# Display the DataFrame's column summary to confirm 'day_of_week' was added.
preppedDataFrame

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string]

In [13]:
# Chronological split at 2011-07-01: invoices before that date are used for
# training, invoices on or after it for testing.
trainDataFrame = (
    preppedDataFrame
    .where('InvoiceDate < "2011-07-01"')
)
testDataFrame = (
    preppedDataFrame
    .where('InvoiceDate >= "2011-07-01"')
)

In [14]:
# Number of rows in the training split.
trainDataFrame.count()

245903

In [15]:
# Number of rows in the test split.
testDataFrame.count()

296006

In [16]:
from pyspark.ml.feature import StringIndexer

In [17]:
# Map each 'day_of_week' string to a numeric index column.
indexer = StringIndexer(
    inputCol='day_of_week',
    outputCol='day_of_week_index',
)

In [18]:
from pyspark.ml.feature import OneHotEncoder

In [19]:
# One-hot encode the day index so it is not treated as an ordinal number.
encoder = OneHotEncoder(
    inputCol='day_of_week_index',
    outputCol='day_of_week_encoded',
)

In [20]:
from pyspark.ml.feature import VectorAssembler

In [21]:
# Combine price, quantity and the encoded weekday into one 'features' vector.
vectorAssembler = VectorAssembler(
    inputCols=['UnitPrice', 'Quantity', 'day_of_week_encoded'],
    outputCol='features',
)

In [22]:
from pyspark.ml import Pipeline

In [23]:
# Chain the three feature stages in dependency order: index -> encode -> assemble.
transformationPipeline = Pipeline(
    stages=[indexer, encoder, vectorAssembler],
)

In [24]:
# Fit the feature pipeline on the training split only, so nothing is learned
# from the test split.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [25]:
# Apply the fitted stages to the training split to obtain the 'features' column.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [26]:
# Mark the transformed training data for caching, since it is reused below for
# both model fitting and cost evaluation.
transformedTraining.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [27]:
from pyspark.ml.clustering import KMeans

In [29]:
# K-means with 20 clusters; the seed is fixed so reruns are reproducible.
kmeans = KMeans(
    k=20,
    seed=1,
)

In [30]:
# Train the clustering model on the transformed training data.
# Presumably KMeans reads the 'features' column produced by the assembler
# (no features column is set explicitly here) -- TODO confirm.
kmModel = kmeans.fit(transformedTraining)

In [31]:
# Clustering cost of the model on the training data.
# NOTE(review): computeCost appears to be deprecated in Spark 2.4 and removed
# in 3.0 -- confirm against the Spark version in use before upgrading.
kmModel.computeCost(transformedTraining)

84553739.96537484

In [32]:
# Transform the test split with the pipeline fitted on training data, then
# compute the same cost metric on it for comparison with the training cost.
# NOTE(review): computeCost appears to be deprecated in Spark 2.4 and removed
# in 3.0 -- confirm against the Spark version in use before upgrading.
transformedTest = fittedPipeline.transform(testDataFrame)
kmModel.computeCost(transformedTest)

517507094.72221166

In [33]:
from pyspark.sql import Row

In [34]:
# Build a small RDD of Rows through the SparkContext and convert it to a DataFrame.
(
    spark.sparkContext
    .parallelize([Row(1), Row(2), Row(3)])
    .toDF()
)

DataFrame[_1: bigint]