In [1]:
"""
  spark-submit: 애플리케이션 코드를 클러스터에 전송하고, 실행시키는 역할을 한다.

"""

'\n  spark-submit: 애플리케이션 코드를 클러스터에 전송하고, 실행시키는 역할을 한다.\n\n'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [3]:
staticDataFrame = spark.read.format("csv") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load("data/retail-data/by-day/*.csv")

staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema

In [4]:
"""
  윈도우 함수(window function):
    집계 시에 시계열 컬럼을 기준으로 각 날짜에 대한 전체 데이터를 가지는 윈도우를 구성한다.
"""

from pyspark.sql.functions import window, col

staticDataFrame \
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate") \
  .groupby(
    col("CustomerId"), window(col("InvoiceDate"), "1 day")
  ) \
  .sum("total_cost") \
  .show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   13417.0|[2011-12-04 09:00...|            404.83|
|   15358.0|[2011-12-05 09:00...| 830.0600000000003|
|   15392.0|[2011-12-05 09:00...|304.40999999999997|
|   15290.0|[2011-12-05 09:00...|263.02000000000004|
|   16811.0|[2011-12-05 09:00...|             232.3|
+----------+--------------------+------------------+
only showing top 5 rows



In [5]:
# streaming

streamingDataFrame = spark.readStream \
  .schema(staticSchema) \
  .option("maxFilesPerTrigger", 1) \
  .format("csv") \
  .option("header", "true") \
  .load("data/retail-data/by-day/*.csv")

streamingDataFrame.isStreaming

True

In [6]:
purchaseByCustomerPerHour = streamingDataFrame \
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate") \
  .groupby(
    col("CustomerId"), window(col("InvoiceDate"), "1 day")
  ) \
  .sum("total_cost")

In [7]:
purchaseByCustomerPerHour.writeStream \
  .format("memory") \
  .queryName("customer_purchases") \
  .outputMode("complete") \
  .start()

<pyspark.sql.streaming.StreamingQuery at 0x10c719160>

In [9]:
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
""") \
  .show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2011-11-14 09:00...|          55316.08|
|      null|[2011-11-07 09:00...|          42939.17|
|      null|[2011-03-29 09:00...| 33521.39999999998|
|      null|[2011-12-08 09:00...|31975.590000000007|
|   18102.0|[2011-09-15 09:00...|31661.540000000005|
+----------+--------------------+------------------+
only showing top 5 rows



In [11]:
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [14]:
from pyspark.sql import functions as F

preppedDataFrame = staticDataFrame \
  .na.fill(0) \
  .withColumn("day_of_week", F.date_format(F.col("InvoiceDate"), "EEEE")) \
  .coalesce(5)

In [15]:
trainDataFrame = preppedDataFrame \
  .where("InvoiceDate < '2011-07-01'") \

testDataFrame = preppedDataFrame \
  .where("InvoiceDate >= '2011-07-01'") \

In [16]:
trainDataFrame.count(), testDataFrame.count()

(245903, 296006)

In [18]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer() \
  .setInputCol("day_of_week") \
  .setOutputCol("day_of_week_index")

In [19]:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder() \
  .setInputCol("day_of_week_index") \
  .setOutputCol("day_of_week_encoded")

In [21]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler() \
  .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"]) \
  .setOutputCol("features")

In [23]:
from pyspark.ml import Pipeline

tranformationPipeline = Pipeline() \
  .setStages([indexer, encoder, vectorAssembler])

In [25]:
fittedPipeline = tranformationPipeline.fit("trainDataFrame")

AttributeError: 'str' object has no attribute '_jdf'