In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
df = spark.read.format("json") \
  .load("data/flight-data/json/2015-summary.json")

In [3]:
df. printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [4]:
# Example of Schema on Read(data source defines a schema)

spark.read.format("json") \
  .load("data/flight-data/json/2015-summary.json") \
  .schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [5]:
# Example of reading data with schema we defined already

from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello":"world"})
])

df = spark.read.format("json") \
  .schema(myManualSchema) \
  .load("data/flight-data/json/2015-summary.json")

In [None]:
# Columns and Expressions

# cannot manipulate an individual column outside the context of a DataFrame
# must use Spark transformations to modify a column


In [7]:
# How to make columns: col or column functions

from pyspark.sql.functions import col, column

col("someColumnName")
column("someColumnName")

# Columns are not resolved until we compare the column names 
# with those we are maintaining in the catalog

# Explicit column references
# useful for a join
# analyzer 단계에서 resolve 하는 것을 생략한다.

df.col("count")

AttributeError: 'DataFrame' object has no attribute 'col'

In [None]:
# Experssions
# a set of transformations on one or more values in a record in a DataFrame.
# It's like a function that takes as input one or more columns names 

In [9]:
# accessing a dataframe's columns
spark.read.format("json") \
  .load("data/flight-data/json/2015-summary.json") \
  .columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [None]:
# Row 

## Creating Rows 


In [11]:
# create a dataframe
df = spark.read.format("json") \
  .load("data/flight-data/json/2015-summary.json")

# register df to temp view
df.createOrReplaceTempView("dfTable")

# create a dataframe with custom rows and schema
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello":"world"})
])
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|            Hello|               null|    1|
+-----------------+-------------------+-----+



In [18]:
# select and selectExpr

df.select("DEST_COUNTRY_NAME").show(2)

df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

from pyspark.sql.functions import expr, col, column

df.select(
  expr("DEST_COUNTRY_NAME"),
  col("DEST_COUNTRY_NAME"),
  column("DEST_COUNTRY_NAME"))\
  .show(2)

# Error -> not Error
df.select(col("DEST_COUNTRY_NAME"), "DEST_COUNTRY_NAME")

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows



DataFrame[DEST_COUNTRY_NAME: string, DEST_COUNTRY_NAME: string]

In [5]:
from pyspark.sql import functions as F

df.select(F.col("DEST_COUNTRY_NAME"), "DEST_COUNTRY_NAME").show(2)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+
|    United States|    United States|
|    United States|    United States|
+-----------------+-----------------+
only showing top 2 rows



In [5]:
# Converting to Spark Types(Literals)
# a translation from a given programming language's literal value to one that Spark understands.
# Literals are expressions.

from pyspark.sql import functions as F

df.select(F.expr("*"), F.lit(1).alias("One")).show(2)


+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [9]:
# Adding Columns
# withColumn method: adding a new column to a DataFrame.
# withColumn takes two arguments: (column_name: string, expression: expression)


df.withColumn("numberOne", F.lit(1)).show(2)

df.withColumn("withinCountry", F.expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")) \
  .show(2)

# Renaming a column by using withColumn
df.withColumn("Destination", F.expr("DEST_COUNTRY_NAME")).columns

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'Destination']

In [11]:
# Renaming Columns
# withColumnRenamed method
# withColumnRenamed() takes two arguments: (existing: string, new: string)

df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [15]:
# Reserved Characters and Keywords
# reserved characters: ' ', '-' etc
# keywords: time, date etc
# use backtick(`)

dfWithLongColName = df.withColumn(
    "This Long Column-Name",
    F.expr("ORIGIN_COUNTRY_NAME"))

dfWithLongColName.selectExpr(
    "`This Long Column-Name`",
    "`This Long Column-Name` as `new col`") \
    .show(2)

dfWithLongColName.select(F.expr("`This Long Column-Name`")).columns

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



['This Long Column-Name']

In [17]:
# Case Sensitivity
# default: case insensitivity

# set spark.sql.caseSensitive true

In [20]:
# Removing Columns
# drop method takes multiple arguements: (*cols)

print(df.drop("ORIGIN_COUNTRY_NAME").columns)

print(dfWithLongColName.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").columns)
dfWithLongColName.columns

['DEST_COUNTRY_NAME', 'count']
['count', 'This Long Column-Name']


['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'This Long Column-Name']

In [32]:
# Changing a Column's Type(cast)

df.withColumn("count2", F.col("count").cast("long"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

In [35]:
# Filtering Rows

df.filter(F.col("count") < 2).show(2)
df.where("count < 2").show(2)

df.where(F.col("count") < 2).where(F.col("ORIGIN_COUNTRY_NAME") != "Croatia") \
  .show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [34]:
# Getting Unique Rows

df.select()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [44]:
# Random Samples

seed = 5
withReplacement = False
fraction = 0.5

df.sample(withReplacement, fraction, seed).count()

126

In [45]:
# Random Splits

seed = 5
dfs = df.randomSplit([0.25, 0.75], seed)
dfs[0].count() > dfs[1].count()

False

In [49]:
# Concatenating and Appending Rows(Union)
from pyspark.sql import Row
schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

df.union(newDF) \
  .where("count = 1") \
  .where(F.col("ORIGIN_COUNTRY_NAME") != "United States") \
  .show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [36]:
# Sorting Rows
# sort and orderBy are exactly same method
# asc, desc, asc_nulls_first, desc_nulls_first, asc_nulls_last, desc_nulls_last
# sortWithinPartitions


df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(F.col("count"), F.col("DEST_COUNTRY_NAME")).show(5)

df.orderBy(F.expr("count desc")).show(2)
df.orderBy(F.col("count").desc(), F.col("DEST_COUNTRY_NAME").asc()).show(2)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--

In [37]:
# Limit

df.limit(5).show()
df.orderBy(F.expr("count desc")).limit(6).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
+--------------------+-------------------+-----+



In [41]:
# Repartition and Coalesce
# Repartition will incur a full shuffle
# 특정 column을 기준으로 자주 filtering 한다면, 그 column을 기준으로 repartition을 해라.
# coalesce : 전체 파티션을 셔플하지 않고, 파티션을 병합하려는 경우에 사용.

df.rdd.getNumPartitions()

df.repartition(5)


# 목적지를 기준으로 5개의 파티션으로 나누고, 셔플 없이 2개의 파티션으로 병합
df.repartition(5, F.col("DEST_COUNTRY_NAME")).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [43]:
# Collecting Rows to the Driver
# collect(), take(), show()
# 로컬에서 처리하기 위해서는 driver로 데이터를 수집해야한다.

collectDf = df.limit(10)
collectDf.take(5)
collectDf.show()
collectDf.show(5, "false")
collectDf.collect()

collectDf.toLocalIterator()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



ValueError: invalid literal for int() with base 10: 'false'