In [1]:
import os

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON

# Build (or reuse) the Spark session with the session time zone set to Asia/Seoul.
spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Configure the notebook so DataFrames are rendered as tables.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)  # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)  # display output columns size

# Common data locations.
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Working directory of the kernel process. Uses the standard library instead of
# the `!pwd` shell magic, so the cell is plain Python and does not depend on
# IPython or on a `pwd` binary being available.
work_dir = os.getcwd()

# Tuning for a local environment.
# Number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.shuffle.partitions", 5)
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")

# Last expression of the cell: shows the session summary in the notebook.
spark

In [10]:
# Print every configuration entry of the running Spark context,
# one (key, value) tuple per line.
#
# Earlier experiment, left disabled:
#   dictionary = spark.conf.get("parquet.enable.dictionary")
#
# NOTE(review): `sc` is bound to the SparkContext *class*, not to the live
# context, and nothing in this cell uses it - confirm whether it is still needed.
from pyspark.context import SparkContext as sc

spark_conf = spark.sparkContext.getConf()
for conf in spark_conf.getAll():
    print(conf)



('spark.sql.session.timeZone', 'Asia/Seoul')
('spark.driver.port', '34951')
('spark.executor.id', 'driver')
('spark.app.name', 'pyspark-shell')
('spark.app.id', 'local-1653614399786')
('spark.driver.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true')
('spark.app.startTime', '1653614398910')
('spark.rdd.compress', 'True')
('spark.sql.warehouse.dir', 'file:/home/jovyan/work/example/spark-warehouse')
('spark.serializer.objectStreamReset', '100')
('spark.driver.host', '8fe632226952')
('spark.master', 'local[*]')
('spark.submit.pyFiles', '')
('spark.submit.deployMode', 'client')
('spark.executor.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true')
('spark.ui.showConsoleProgress', 'true')


In [3]:
""" union 함수 """
from pyspark.sql import Row

# Explicit schema: two string columns and one integer column, all declared
# non-nullable (third argument of add() is the nullable flag).
schema = (
    StructType()
    .add("source", StringType(), False)
    .add("target", StringType(), False)
    .add("visit", IntegerType(), False)
)

# Two positional Rows whose values line up with the schema's field order.
newRows = [
    Row(*values)
    for values in (
        ("New Country", "Other Country", 5),
        ("New Country 2", "Other Country 3", 1),
    )
]

# Parallelized collections:
# SparkContext.parallelize takes an existing iterable or collection from the
# driver program and copies its elements into a distributed dataset that can
# be operated on in parallel.
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

newDF.show()

+-------------+---------------+-----+
|       source|         target|visit|
+-------------+---------------+-----+
|  New Country|  Other Country|    5|
|New Country 2|Other Country 3|    1|
+-------------+---------------+-----+



In [4]:
# Save newDF as Parquet under the relative path "foo/parquet",
# overwriting whatever a previous run left there.
newDF.write.parquet("foo/parquet", mode="overwrite")

In [10]:
# Read the Parquet data back and inspect it: schema, up to 10 untruncated rows,
# and the full set of query plans (parsed, analyzed, optimized, physical).
parquet_reader = spark.read
foo = parquet_reader.parquet("foo/parquet")

foo.printSchema()
foo.show(n=10, truncate=False)
foo.explain(mode="extended")

root
 |-- source: string (nullable = true)
 |-- target: string (nullable = true)
 |-- visit: integer (nullable = true)

+-------------+---------------+-----+
|source       |target         |visit|
+-------------+---------------+-----+
|New Country 2|Other Country 3|1    |
|New Country  |Other Country  |5    |
+-------------+---------------+-----+

== Parsed Logical Plan ==
Relation [source#74,target#75,visit#76] parquet

== Analyzed Logical Plan ==
source: string, target: string, visit: int
Relation [source#74,target#75,visit#76] parquet

== Optimized Logical Plan ==
Relation [source#74,target#75,visit#76] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [source#74,target#75,visit#76] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/foo/parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<source:string,target:string,visit:int>



In [8]:
# Column pruning demo: select a single column and inspect the plans.
# In the physical plan, check the FileScan's ReadSchema to see which columns
# are actually read from the Parquet files.
column_pruning = spark.read.parquet("foo/parquet")
visit_only = column_pruning.select("visit")
visit_only.explain(mode="extended")

== Parsed Logical Plan ==
'Project ['visit]
+- Relation [source#48,target#49,visit#50] parquet

== Analyzed Logical Plan ==
visit: int
Project [visit#50]
+- Relation [source#48,target#49,visit#50] parquet

== Optimized Logical Plan ==
Project [visit#50]
+- Relation [source#48,target#49,visit#50] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [visit#50] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/foo/parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<visit:int>



In [11]:
# Filter pushdown demo: apply a predicate on `visit` and inspect the plans.
# In the physical plan, check the FileScan's PushedFilters to see which
# predicates are handed to the Parquet scan.
filter_pushdown = spark.read.parquet("foo/parquet")
visits_over_100 = filter_pushdown.where("visit > 100")
visits_over_100.explain(mode="extended")

== Parsed Logical Plan ==
'Filter ('visit > 100)
+- Relation [source#93,target#94,visit#95] parquet

== Analyzed Logical Plan ==
source: string, target: string, visit: int
Filter (visit#95 > 100)
+- Relation [source#93,target#94,visit#95] parquet

== Optimized Logical Plan ==
Filter (isnotnull(visit#95) AND (visit#95 > 100))
+- Relation [source#93,target#94,visit#95] parquet

== Physical Plan ==
*(1) Filter (isnotnull(visit#95) AND (visit#95 > 100))
+- *(1) ColumnarToRow
   +- FileScan parquet [source#93,target#94,visit#95] Batched: true, DataFilters: [isnotnull(visit#95), (visit#95 > 100)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/work/foo/parquet], PartitionFilters: [], PushedFilters: [IsNotNull(visit), GreaterThan(visit,100)], ReadSchema: struct<source:string,target:string,visit:int>

