In [None]:
# Start (or reuse) a Spark session running in local mode (`local[*]`).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_on_docker")
    .getOrCreate()
)

In [None]:
# Read all per-day retail CSV files into a single DataFrame. The first row of
# each file is used as the header and column types are inferred from the data.
staticDataFrame = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("work/TheDefinitiveGuide/Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")
)

# Use 5 partitions for Spark SQL shuffles in the queries that follow.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Keep the inferred schema around and expose the data to SQL as `retail_data`.
staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

In [None]:
from pyspark.sql.functions import window, column, desc, col

# Show the five (customer, one-day window) groups with the highest total spend,
# where spend per row is UnitPrice * Quantity.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .sort(desc("sum(total_cost)"))
    .show(5)
)


In [None]:
# Print the column names and inferred types of the loaded retail data.
staticDataFrame.printSchema()

In [None]:
# Data preparation: replace nulls with 0, derive the full weekday name from
# InvoiceDate ("EEEE" pattern), and coalesce the result to 5 partitions.
from pyspark.sql.functions import date_format, col

preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)

In [None]:
# Time-based split on InvoiceDate: rows before 2011-07-01 are used for
# training, rows on or after that date are held out for testing.
trainDataFrame = preppedDataFrame.filter("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.filter("InvoiceDate >= '2011-07-01'")

In [None]:
# Report how many rows landed in each split (each count runs a Spark job).
print(
    'trainDataFrame : ',
    trainDataFrame.count(),
)
print(
    'testDataFrame  : ',
    testDataFrame.count(),
)

In [None]:
# Map each weekday name in `day_of_week` to a numeric index column.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol="day_of_week",
    outputCol="day_of_week_index",
)


In [None]:
# One-hot encode the weekday index into a vector column.
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(
    inputCol="day_of_week_index",
    outputCol="day_of_week_encoded",
)

In [None]:
# Combine price, quantity and the encoded weekday into one `features` vector.
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice", "Quantity", "day_of_week_encoded"],
    outputCol="features",
)

In [None]:
# Chain the three feature steps (index -> one-hot encode -> assemble) so they
# can be fitted and applied as a single unit.
from pyspark.ml import Pipeline

transformationPipeline = Pipeline(stages=[indexer, encoder, vectorAssembler])


In [None]:
# Fit the transformation pipeline on the training split only; the fitted
# pipeline is then reused to transform both the training and the test data.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [None]:
# With the pipeline fitted on the training data, use it to transform that
# data in a consistent and repeatable way. The result gains the indexed,
# encoded and assembled `features` columns produced by the pipeline stages.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [None]:
# Mark the transformed training set to be cached so it does not have to be
# recomputed from the CSV files each time it is used.
transformedTraining.cache()

In [None]:
# The training set is ready, so set up the model to train on it:
# an (untrained) KMeans estimator with 20 clusters and a fixed seed.
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=20, seed=1)



In [None]:
# Train the model: `kmeans` is the untrained algorithm, and fitting it on the
# transformed training data yields the trained model, `kmModel`.
kmModel = kmeans.fit(transformedTraining)

In [None]:
# Apply the pipeline that was fitted on the training data to the test split,
# so the test rows are indexed, encoded and assembled in the same way.
# NOTE(review): no cost or evaluation metric is computed in this cell or
# anywhere after training -- confirm whether an evaluation step is missing.


transformedTest = fittedPipeline.transform(testDataFrame)



In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()