In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this exercise, tagging it with an
# application name and a custom configuration entry.
spark = (
    SparkSession.builder
    .appName("Data Engineer Intermediate Day9")
    .config("spark.dataengineer.intermediate.day9", "exercise-lag")
    .getOrCreate()
)

# Last expression of the cell: display every (key, value) pair of the
# configuration attached to the session's SparkContext.
spark.sparkContext.getConf().getAll()

[('spark.dataengineer.intermediate.day9', 'exercise-9'),
 ('spark.app.id', 'local-1598590335405'),
 ('spark.driver.port', '43487'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', 'bcfc9e857123'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'Data Engineer Intermediate Day9')]

In [3]:
""" DataFrame 생성 """
# Read one day of retail data from CSV; the first line is the header and the
# column types are inferred from the data.
df = spark.read.csv(
    "data/retail-data/by-day/2010-12-01.csv",
    header=True,
    inferSchema=True,
)

# Print the inferred schema, expose the DataFrame to Spark SQL under the
# view name "retail", and preview the first five rows.
df.printSchema()
df.createOrReplaceTempView("retail")
df.show(5)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   8

In [4]:
from pyspark.sql.functions import *
# Count rows per StockCode and show the five most frequent codes.
(
    df
    .groupBy("StockCode")
    .count()
    .orderBy(col("count").desc())
    .show(5)
)

+---------+-----+
|StockCode|count|
+---------+-----+
|    22632|   20|
|    22866|   19|
|   85123A|   17|
|    22865|   15|
|    22961|   14|
+---------+-----+
only showing top 5 rows



In [5]:
# Keep only the rows for the two most frequent stock codes found above,
# sorted by ascending Quantity.
small = (
    df
    .where(col("StockCode").isin("22632", "22866"))
    .orderBy("Quantity")
)
small.show(10)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|  C536543|    22632|HAND WARMER RED R...|      -1|2010-12-01 14:30:00|      2.1|   17841.0|United Kingdom|
|   536409|    22866|HAND WARMER SCOTT...|       1|2010-12-01 11:45:00|      2.1|   17908.0|United Kingdom|
|   536522|    22866|HAND WARMER SCOTT...|       1|2010-12-01 12:49:00|      2.1|   15012.0|United Kingdom|
|   536522|    22632|HAND WARMER RED R...|       1|2010-12-01 12:49:00|      2.1|   15012.0|United Kingdom|
|   536409|    22866|HAND WARMER SCOTT...|       1|2010-12-01 11:45:00|      2.1|   17908.0|United Kingdom|
|   536464|    22866|HAND WARMER SCOTT...|       1|2010-12-01 12:23:00|      2.1|   17968.0|United Kingdom|
|   536464|    22866|HAND WA

In [6]:
from pyspark.sql.window import Window

# Window over each StockCode, largest Quantity first. InvoiceNo is added as a
# secondary sort key: Quantity alone has many ties, and the relative order of
# tied rows in a window is not guaranteed, so which tied row received which
# PrevQuantity could differ from run to run.
quantity_window = (
    Window
    .partitionBy("StockCode")
    .orderBy(desc("Quantity"), "InvoiceNo")
)

# PrevQuantity: the Quantity of the preceding row inside the window (null for
# the first row of each StockCode). The column is referenced by name so that it
# resolves against `small`, the DataFrame being transformed, instead of being
# borrowed from its parent `df`.
dfLag = small.withColumn("PrevQuantity", lag("Quantity").over(quantity_window))

# LagQuantity: how much Quantity dropped relative to the preceding row.
x = dfLag.withColumn("LagQuantity", dfLag["PrevQuantity"] - dfLag["Quantity"])

x.show(10)
# x.fillna({'LagQuantity': 0}).show(10)  # Passing a dict to fillna replaces nulls with 0 in the LagQuantity column only.

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|PrevQuantity|LagQuantity|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------+-----------+
|   536394|    22632|HAND WARMER RED R...|      96|2010-12-01 10:39:00|     1.85|   13408.0|United Kingdom|        null|       null|
|   536567|    22632|HAND WARMER RED R...|      24|2010-12-01 15:27:00|      2.1|   16048.0|United Kingdom|          96|         72|
|   536398|    22632|HAND WARMER RED R...|      12|2010-12-01 10:52:00|      2.1|   13448.0|United Kingdom|          24|         12|
|   536423|    22632|HAND WARMER RED R...|      12|2010-12-01 12:08:00|      2.1|   18085.0|United Kingdom|          12|          0|
|   536477|    22632|HAND WARMER RED R...|      12|2010-12-01 12:27:0