# 6장. 다양한 데이터 타입 다루기

In [1]:
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("file:///home/ubuntu/ybigta/Dataset_spark/data/retail-data/by-day/2010-12-01.csv")
    
# 스키마 정보 출력
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



## 6.1 > spark 데이터 타입으로 변환  : lit()

In [2]:
from pyspark.sql.functions import lit

df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

## 6.2 > Boolean 데이터 다루기

In [3]:
from pyspark.sql.functions import col

       #where로 로우를 제한        #select로 컬럼을 제한     #False 옵션을 주면 데이터를 줄이지 않고 보여줌
df.where(col("InvoiceNo") != 536365)\
.select("InvoiceNo", "Description")\
.show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [4]:
# == , =  모두 두 인자가 같은지 비교할 때 쓰임
df.where("InvoiceNo = 536365")\
.show(2)

df.where("InvoiceNo == 536365")\
.show(2)

# <> , !=  모두 두 인자가 다른지 비교할 때 쓰임
df.where("InvoiceNo <> 536365")\
.show(2)

df.where("InvoiceNo != 536365")\
.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|  

- and 와 or 
    - and 는 모든 표현식을 차례로 필터를 적용해야함  > where().where().where()
    - or 는 반드시 동일한 구문에 조건을 정의해야함   > where(a | b)

In [5]:
# 문자열 속 문자열 찾기  > instr(df.Description, "POSTAGE")  : df.Description에서 "POSTAGE" 위치 리턴
from pyspark.sql.functions import instr, expr

priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [6]:
# 불리언 컬럼을 통해 필터링 가능
     #불리언컬럼 생성              #불리언컬럼으로 필터링
df.withColumn("isExpensive", priceFilter | descripFilter)\
.where("isExpensive").where("StockCode = 'DOT'")\
.show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



## 6.3 > 수치형 데이터 다루기

In [7]:
from pyspark.sql.functions import expr, pow

fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)
#alias() : dataframe 복제 : 하나의 열일때 naming 가능

df.select(expr("CustomerId"), expr("(POWER((Quantity * UnitPrice), 2.0) + 5) as Real")).show(2)

df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as Real").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows

+----------+------------------+
|CustomerId|              Real|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows

+----------+------------------+
|CustomerId|              Real|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



- 반올림 round  / 내림 : bround

In [8]:
from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [9]:
from pyspark.sql.functions import corr, count, mean, stddev_pop, min, max      #다양한 function 지원

df.stat.corr("Quantity", "UnitPrice")
# df.select(corr("Quantity", "UnitPrice")).show()

# 기초 통계량 확인 가능
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

- StatFunction 패키지에서 다양한 통계함수 지원 (stat속성 사용)

In [10]:
from pyspark.sql.functions import monotonically_increasing_id

quantileProbs = [0.5]
relError = 0.05

df.stat.approxQuantile("UnitPrice", quantileProbs, relError)   #백분위수 계산

df.stat.crosstab("StockCode", "Quantity").show()

df.stat.freqItems(["StockCode", "Quantity"]).show()

df.select(monotonically_increasing_id()).show(2)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

## 6.4 > 문자열 데이터 타입 다루기

데이터 추출, 데이터 치환, 문자열 존재 여부, 대/소문자 변환

- 대/소문자 변환

In [11]:
from pyspark.sql.functions import initcap   #공백으로 구분되는 모든 단어의 첫글자를 대문자로 변경

df.select(initcap("Description")).show(2, False)

+----------------------------------+
|initcap(Description)              |
+----------------------------------+
|White Hanging Heart T-light Holder|
|White Metal Lantern               |
+----------------------------------+
only showing top 2 rows



In [12]:
from pyspark.sql.functions import lower, upper     #전체를 대/소문자로 변형

df.select("Description", lower(col("Description")), upper(col("Description"))).show(2)
# 함수 안에 col형태로 주어져야함

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



- 문자열 주변 공백 제거 및 추가

In [13]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

df.select(ltrim(lit("      Hello        ")).alias("ltrim"),
         rtrim(lit("      Hello        ")).alias("rtrim"),
         trim(lit("      Hello        ")).alias("trim"),
         lpad(lit("Hello"), 10, " ").alias("lpad"), 
         rpad(lit("Hello"), 10, "@").alias("rpad")).show(2)
#trim : 공백제거
#pad  : 주어진 인자로 채우기

+-------------+-----------+-----+----------+----------+
|        ltrim|      rtrim| trim|      lpad|      rpad|
+-------------+-----------+-----+----------+----------+
|Hello        |      Hello|Hello|     Hello|Hello@@@@@|
|Hello        |      Hello|Hello|     Hello|Hello@@@@@|
+-------------+-----------+-----+----------+----------+
only showing top 2 rows



- 정규표현식
    - 문자열의 존재 여부, 일치하는 모든 문자열 치환에 정규 표현식이 사용됨
    - regexp_extract()
    - regexp_replace()

In [14]:
from pyspark.sql.functions import regexp_replace
# 정규표현식을 이용한 문자 치환
regexp_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(regexp_replace(col("Description"), regexp_string, "COLOR").alias("color_clean"), col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [15]:
from pyspark.sql.functions import translate

df.select(translate(col("Description"), "LEET", "1337"), col("Description")).show(2)
# 해당하는 모든 문자 치환

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [16]:
from pyspark.sql.functions import regexp_extract
# 데이터 추출 : regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"), extract_str, 1).alias("color_clean"), col("Description")).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



In [17]:
# 포함여부는 instr(위치 인덱스 리턴)로 확인 가능 > 조건을 걸어서 필터링
from pyspark.sql.functions import instr

containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack|containsWhite)\
.where("hasSimpleColor")\
.select("Description").show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



- 인수가 동적으로 변하는 상황

In [18]:
# locate() :                                                                                ㄷㄷㄷㅈ??!
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)
        
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
    .select("Description").show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



## 6.5 > 날짜와 타임스탬프 데이터 타입 다루기

-날짜 -타임스탬프 형태로 데이터를 다룸 <br>
TimestampType : 초단위의 정밀도<br>
Long          : 그 이상의 정밀도를 원한다면!

In [19]:
# 현재 시간 : current_timestamp       현재 날짜 : current_date
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())

dateDF.createOrReplaceTempView("dateTable")                                           #얘는 도대체 뭐하는 녀석이지?
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



- 날짜 더하기 빼기
    - date_sub
    - date_add

In [20]:
from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2020-04-27|        2020-05-07|
+------------------+------------------+
only showing top 1 row



- 두 날짜 사이 일수 반환
    - datediff
    - months_between
<br><br>
- to_date : 문자열을 날짜로 변환, 필요에 따라 날짜 포맷(java의 SimpleDateFormat)도 지정 가능

In [21]:

from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_age", date_sub(col("today"), 7))\
.select(datediff(col("week_age"), col("today"))).show(1)

dateDF.select(to_date(lit("2018-04-10")).alias("start"),  to_date(lit("2020-05-01")).alias("end"))\
.select(months_between(col("start"), col("end"))).show(1)

+-------------------------+
|datediff(week_age, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------+
|months_between(start, end)|
+--------------------------+
|              -24.70967742|
+--------------------------+
only showing top 1 row



In [22]:
from pyspark.sql.functions import to_date, lit

spark.range(5).withColumn("date", lit("2020-04-10"))\
.select(to_date(col("date"))).show(1)

+---------------+
|to_date(`date`)|
+---------------+
|     2020-04-10|
+---------------+
only showing top 1 row



to_date는 포맷을 맞추지 않으면 null을 리턴<br>
그렇기에 to_timestamp 사용

In [25]:
from pyspark.sql.functions import to_date, to_timestamp

dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(to_date(lit("2017-12-11")).alias("date"), 
                                   to_date(lit("2017-20-12")).alias("date2"))

cleanDateDF.createOrReplaceTempView("dateTable2")

cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+----------------------------------+
|to_timestamp(`date`, 'yyyy-dd-MM')|
+----------------------------------+
|               2017-12-11 00:00:00|
+----------------------------------+



In [28]:
# 날짜도 스파크 리터럴로 바뀐 날짜와 비교가 가능!
cleanDateDF.filter(col("date") < lit("2017-12-12")).show()

+----------+-----+
|      date|date2|
+----------+-----+
|2017-12-11| null|
+----------+-----+



## 6.6 > null 값 다루기

스파크는 null값 허용을 강제할 수는 없다. <br>
하지만 null이 없어야할 컬럼에 null이 존재하면 특이한 에러가 발생할 수가 있다. <br><br>
DataFrame의 null을 다루기 위해서는 .na 아래에서 function 사용

- null값을 다른값으로 채워넣거나
- null값을 제거하는 방법

#### 1. coalesce : 인수로 지정한 여러 컬럼 중 null 이 아닌 첫번째 값을 반환

In [31]:
from pyspark.sql.functions import coalesce

df.select(coalesce(col("Description"), col("CustomerId"))).show(5, False)

+-----------------------------------+
|coalesce(Description, CustomerId)  |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
+-----------------------------------+
only showing top 5 rows



#### 2. ifnull, nullif, nvl, vnl2 ( sql)

#### 3. drop : null을 가진 모든 row 제거

In [38]:
df.na.drop()
df.na.drop("any")                                       # 적어도 하나의 컬럼이 null인 경우 제거
df.na.drop("all", subset=["StockCode", "InvoiceNo"])    # 모든 컬럼이 null인 row만 제거

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

#### 4. fill : 특정값으로 채우기

In [40]:
# df.na.fill(5:Integer)
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

fill_cols_vals = {"StockCode": 5, "Description": "No Value"}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

#### 5. replace : 조건에 따라 유연하게 대체 (원래 값과 변경하고 자하는 데이터 타입이 같아야함)

In [42]:
# df.na.replace(기존값, 대체값, 타겟컬럼)
df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## 6.7 > 정렬하기

- null 값 표시 기준 지정 함수
    - asc_nulls_first
    - desc_nulls_first
    - asc_nulls_last
    - desc_nulls_last

## 6.8 > 복합데이터 다루기 : 구조체, 배열, 맵

#### 1. 구조체

In [43]:
# 구조체형 데이터 타입 : sturct
from pyspark.sql.functions import struct

complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [47]:
# . 혹은 getField를 통해 구조체 내부 데이터에 접근
complexDF.select("complex.Description").show(2, False)
complexDF.select(col("complex").getField("Description")).show(2, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 2 rows

+----------------------------------+
|complex.Description               |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 2 rows



#### 2. 배열

- split : 배열화 하여 저장 > list
- size  : 배열 크기 > int
- array_contains : 특정값 존재 > true / false
- explode : 입력된 배열 컬럼에 포함된 모든 값을 로우로 변환 나머지 컬럼은 중복되어 표시

In [51]:
from pyspark.sql.functions import split

df.select(split(col("Description"), " ")).show(2)

df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]", "array_col[1]").show(2)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows

+------------+------------+
|array_col[0]|array_col[1]|
+------------+------------+
|       WHITE|     HANGING|
|       WHITE|       METAL|
+------------+------------+
only showing top 2 rows



In [52]:
from pyspark.sql.functions import size

df.select(size(split(col("Description"), " "))).show(3)

+---------------------------+
|size(split(Description,  ))|
+---------------------------+
|                          5|
|                          3|
|                          5|
+---------------------------+
only showing top 3 rows



In [53]:
from pyspark.sql.functions import array_contains

df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
+--------------------------------------------+
only showing top 2 rows



In [56]:
from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "exploded").show()

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
|WHITE HANGING HEA...|   536365|   HEART|
|WHITE HANGING HEA...|   536365| T-LIGHT|
|WHITE HANGING HEA...|   536365|  HOLDER|
| WHITE METAL LANTERN|   536365|   WHITE|
| WHITE METAL LANTERN|   536365|   METAL|
| WHITE METAL LANTERN|   536365| LANTERN|
|CREAM CUPID HEART...|   536365|   CREAM|
|CREAM CUPID HEART...|   536365|   CUPID|
|CREAM CUPID HEART...|   536365|  HEARTS|
|CREAM CUPID HEART...|   536365|    COAT|
|CREAM CUPID HEART...|   536365|  HANGER|
|KNITTED UNION FLA...|   536365| KNITTED|
|KNITTED UNION FLA...|   536365|   UNION|
|KNITTED UNION FLA...|   536365|    FLAG|
|KNITTED UNION FLA...|   536365|     HOT|
|KNITTED UNION FLA...|   536365|   WATER|
|KNITTED UNION FLA...|   536365|  BOTTLE|
|RED WOOLLY HOTTIE...|   536365|     RED|
+--------------------+---------+--

#### 3. 맵

In [59]:
from pyspark.sql.functions import create_map

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2, False)

# key값으로 조회 가능
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|[WHITE HANGING HEART T-LIGHT HOLDER -> 536365]|
|[WHITE METAL LANTERN -> 536365]               |
+----------------------------------------------+
only showing top 2 rows

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [61]:
# map을 분리하여 컬럼으로 변환가능 ( by explode 함수)
df.select(create_map(col("description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("explode(complex_map)").show(2)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



## 6.9 JSON 다루기!!!

- JSON 컬럼 생성

In [63]:
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey":{"myJSONValue":[1,2,3]}}' as jsonString""")

In [65]:
from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column_objects"),
             json_tuple(col("jsonString"), "myJSONKey").alias("column_tuple")).show(2)

+--------------+--------------------+
|column_objects|        column_tuple|
+--------------+--------------------+
|             2|{"myJSONValue":[1...|
+--------------+--------------------+



- struct 를 JSON으로    : to_json

In [66]:
from pyspark.sql.functions import to_json

df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct"))).show(3)

+-----------------------+
|structstojson(myStruct)|
+-----------------------+
|   {"InvoiceNo":"536...|
|   {"InvoiceNo":"536...|
|   {"InvoiceNo":"536...|
+-----------------------+
only showing top 3 rows



- JSON을 다시 객체로    : from_json
    - 스키마를 파라미터로 제공해야함...

In [68]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

parseSchema = StructType((
StructField("InvoiceNo", StringType(), True),
StructField("Description", StringType(), True)))

df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+----------------------+--------------------+
|jsontostructs(newJSON)|             newJSON|
+----------------------+--------------------+
|  [536365, WHITE HA...|{"InvoiceNo":"536...|
|  [536365, WHITE ME...|{"InvoiceNo":"536...|
+----------------------+--------------------+
only showing top 2 rows



## 6.10 > 사용자 정의함수 UDF

1_ 사용자 정의함수 정의


In [69]:
def myfunc(a):
    return a*3

2_ 사용자 정의함수 등록 > DataFrame

In [70]:
from pyspark.sql.functions import udf

myfunc_udf = udf(myfunc)

3_ 함수 사용

In [72]:
from pyspark.sql.functions import col

udfExampleDF = spark.range(5).toDF("num")
udfExampleDF.select(myfunc_udf(col("num"))).show(2)

+-----------+
|myfunc(num)|
+-----------+
|          0|
|          3|
+-----------+
only showing top 2 rows



2_2 사용자 정의함수 등록 > sql 

In [81]:
from pyspark.sql.types import IntegerType, DoubleType

# 올바른 함수 작동을 위해 타입지정 필요 > 타입이 맞지 않으면 null을 리턴
spark.udf.register("myfuncpy", myfunc, DoubleType())
# spark.udf.register("myfuncpy", myfunc, IntegerType())

<function __main__.myfunc>

3_2 사용자 정의함수 사용 > sql

In [82]:
udfExampleDF.selectExpr("myfuncpy(num)").show(2)

+-------------+
|myfuncpy(num)|
+-------------+
|         null|
|         null|
+-------------+
only showing top 2 rows

