In [17]:
#create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").\
                                     appName("spark_on_docker").\
                                     getOrCreate()

In [19]:
staticDataFrame = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("work/TheDefinitiveGuide/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")

spark.conf.set("spark.sql.shuffle.partitions", "5")

staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema



In [20]:
from pyspark.sql.functions import window, column, desc, col
staticDataFrame\
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")\
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
  .sum("total_cost")\
  .sort(desc("sum(total_cost)"))\
  .show(5)




+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17450.0|{2011-09-20 00:00...|          71601.44|
|      null|{2011-11-14 00:00...|          55316.08|
|      null|{2011-11-07 00:00...|          42939.17|
|      null|{2011-03-29 00:00...| 33521.39999999998|
|      null|{2011-12-08 00:00...|31975.590000000007|
+----------+--------------------+------------------+
only showing top 5 rows





In [21]:
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [44]:
# data type covertion 
from pyspark.sql.functions import date_format, col

preppedDataFrame = staticDataFrame\
.na.fill(0)\
.withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
.coalesce(5) 

In [45]:
# Split the data into training and test sets
trainDataFrame = preppedDataFrame\
.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
.where("InvoiceDate >= '2011-07-01'")

In [46]:
# Count the number of rows in the training and test sets            
trainDataFrame.count()
testDataFrame.count()



296006

In [47]:
# pyspark also supports StringIndexer and OneHotEncoder
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer()\
.setInputCol("day_of_week")\
.setOutputCol("day_of_week_index")


In [48]:
# Create a OneHotEncoder to convert the day of week index to a vector
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder()\
.setInputCol("day_of_week_index")\
.setOutputCol("day_of_week_encoded")

In [55]:
# in Python
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler()\
.setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
.setOutputCol("features")

In [56]:
# three key features: the price, the quantity, and the day of week
from pyspark.ml import Pipeline
transformationPipeline = Pipeline()\
.setStages([indexer, encoder, vectorAssembler])


In [57]:
# need to fit our transformers to this dataset.
fittedPipeline = transformationPipeline.fit(trainDataFrame)



In [58]:
# After we fit the training data, 
# we are ready to take that fitted pipeline and 
# use it to transform all of our data in a consistent and repeatable way
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [60]:
transformedTraining.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string, day_of_week: string, day_of_week_index: double, day_of_week_encoded: vector, features: vector]

In [64]:
# We now have a training set; it’s time to train the model. 
# First we’ll import the relevant model that we’d like to use and instantiate it:
from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)



In [65]:
# naming pattern of Algorithm, for the untrained version
kmModel = kmeans.fit(transformedTraining)



In [69]:
# After we train this model, 
# we can compute the cost according to some success merits on our training set.


transformedTest = fittedPipeline.transform(testDataFrame)



In [70]:
spark.stop()