Basic Structured Operations

Schemas

In [None]:
from pyspark.shell import spark

# Load the 2015 flight-summary JSON file into a DataFrame. No explicit schema
# is passed, so the column types are whatever the JSON reader determines.
df = (
    spark.read
    .format("json")
    .load("/datasets/flight-data/json/2015-summary.json")
)

# Print the resulting column names and types as a tree.
df.printSchema()

In [None]:
# Read the same file again and expose its schema object as the cell's output.
(
    spark.read
    .format("json")
    .load("/datasets/flight-data/json/2015-summary.json")
    .schema
)

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Declare the schema by hand instead of letting the reader work it out:
# two nullable string columns and one non-nullable long column that also
# carries a custom metadata dictionary.
myManualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
        StructField("count", LongType(), nullable=False, metadata={"hello": "world"}),
    ]
)

# Re-read the flight data, this time applying the hand-written schema.
df = (
    spark.read
    .format("json")
    .schema(myManualSchema)
    .load("/datasets/flight-data/json/2015-summary.json")
)

Columns

In [None]:
from pyspark.sql.functions import col, column
# Two ways of building a Column that refers to a column by name. Nothing is
# resolved against a DataFrame here, and only the value of the last expression
# is echoed as the cell's output.
col("someColumnName")
column("someColumnName")

Columns as expressions

In [None]:
from pyspark.sql.functions import expr
# Build a Column from a SQL expression string. someCol and otherCol are
# placeholder names; the expression is not applied to any DataFrame here.
expr("(((someCol + 5) * 200) - 6) < otherCol")

Creating Rows

In [None]:
from pyspark.sql import Row
# A Row created from positional values only; its values are read back by index.
myRow = Row("Hello", None, 1, False)

# Print the first and third values, one per line.
print(myRow[0], myRow[2], sep="\n")

Creating DataFrames

In [None]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [None]:
# Rebuild df from the flight data with the manually declared schema
# (myManualSchema must already be bound by an earlier cell).
df = (
    spark.read
    .format("json")
    .schema(myManualSchema)
    .load("/datasets/flight-data/json/2015-summary.json")
)

# Register df as a temporary view named "dfTable" so SQL can refer to it.
df.createOrReplaceTempView("dfTable")

In [None]:
# NOTE: this rebinds myManualSchema and myRow, which earlier cells also define.
# Re-running the cell that loads df after this one would pick up this
# three-column schema instead of the flight-data schema.
myManualSchema = StructType(
    [
        StructField("some", StringType(), nullable=True),
        StructField("col", StringType(), nullable=True),
        StructField("names", LongType(), nullable=False),
    ]
)

# One row whose values line up positionally with the schema's three fields.
myRow = Row("Hello", None, 1)

# Build a single-row DataFrame from local data and display it.
myDF = spark.createDataFrame(data=[myRow], schema=myManualSchema)
myDF.show()

select and selectExpr

In [None]:
# Project a single column by name and preview five rows.
(
    df
    .select("DEST_COUNTRY_NAME")
    .show(n=5)
)

In [None]:
# Project two columns by name and preview five rows.
(
    df
    .select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME")
    .show(n=5)
)

In [None]:
from pyspark.sql.functions import expr, col, column
# Select the same column three times, referenced through expr, col and column
# respectively, and preview two rows.
df.select(
    expr("DEST_COUNTRY_NAME"), col("DEST_COUNTRY_NAME"), column("DEST_COUNTRY_NAME")
).show(n=2)

In [None]:
# Rename the column inside the expression string with SQL "AS" syntax.
(
    df
    .select(expr("DEST_COUNTRY_NAME AS destination"))
    .show(n=5)
)

In [None]:
# Alias inside the expression, then apply .alias() on the resulting Column to
# put the original name back on top of it.
(
    df
    .select(
        expr("DEST_COUNTRY_NAME AS destination").alias("DEST_COUNTRY_NAME")
    )
    .show(n=5)
)

In [None]:
# selectExpr takes SQL expression strings directly: the column once under a
# new name and once under its original name.
df.selectExpr(
    "DEST_COUNTRY_NAME as newColumnName",
    "DEST_COUNTRY_NAME",
).show(n=5)

In [None]:
# Keep every original column ("*") and append a boolean column telling whether
# destination and origin are the same country.
(
    df
    .selectExpr(
        "*",
        "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
    )
    .show(n=5)
)

In [None]:
# Aggregate over the whole DataFrame: the average of count and the number of
# distinct destination countries.
(
    df
    .selectExpr("AVG(count)", "COUNT(DISTINCT(DEST_COUNTRY_NAME))")
    .show(n=5)
)

Converting to Spark Types

In [None]:
from pyspark.sql.functions import lit

# Append a constant column: lit(1) turns the Python value 1 into a Column,
# which is then named "One".
(
    df
    .select(expr("*"), lit(1).alias("One"))
    .show(n=5)
)

Adding Columns

In [None]:
# Add a column named "numberOne" holding the literal 1 and preview five rows.
(
    df
    .withColumn("numberOne", lit(1))
    .show(n=5)
)

In [None]:
# Add a boolean column computed from an expression comparing origin and
# destination, then preview five rows.
(
    df
    .withColumn(
        "withinCountry",
        expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"),
    )
    .show(n=5)
)

Renaming Columns

In [None]:
# Rename DEST_COUNTRY_NAME to "dest" and list the resulting column names.
(
    df
    .withColumnRenamed("DEST_COUNTRY_NAME", "dest")
    .columns
)

Reserved Characters and Keywords

In [None]:
# Add a column whose name contains spaces and a hyphen. The name is passed as
# a plain string argument here, so it is written without backticks.
dfWithLongColName = df.withColumn("This Long Column-Name", expr("ORIGIN_COUNTRY_NAME"))

# Inside expression strings the same name is wrapped in backticks, both when
# referencing it and when defining the alias `new col`.
dfWithLongColName.selectExpr(
    "`This Long Column-Name`", "`This Long Column-Name` as `new col`"
).show(n=5)

In [None]:
# Reference the specially named column through col() with the plain name and
# list the columns of the result.
(
    dfWithLongColName
    .select(col("This Long Column-Name"))
    .columns
)

Removing Columns

In [None]:
# Drop one column and list the column names that remain.
(
    df
    .drop("ORIGIN_COUNTRY_NAME")
    .columns
)

In [None]:
# Drop several columns in one call; the resulting DataFrame is the cell output.
(
    dfWithLongColName
    .drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")
)

Changing Column's Type

In [None]:
# Add count2 as a copy of count cast to a different type. df is loaded with
# count declared as LongType, so casting to "long" would leave the type
# unchanged and demonstrate nothing; casting to "string" makes the change
# visible in the resulting DataFrame's schema.
df.withColumn("count2", col("count").cast("string"))

Filtering Rows

In [None]:
# Keep rows whose count is below 2, with the predicate written as a Column
# expression, and preview five of them.
(
    df
    .filter(col("count") < 2)
    .show(n=5)
)

In [None]:
# Same filter as a SQL predicate string passed to where().
(
    df
    .where("count < 2")
    .show(n=5)
)

In [None]:
# Chain two filters: count below 2 and origin different from "Croatia".
(
    df
    .where(col("count") < 2)
    .where(col("ORIGIN_COUNTRY_NAME") != "Croatia")
    .show(n=5)
)

Getting Unique Rows

In [None]:
# Count the distinct (origin, destination) pairs.
(
    df
    .select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")
    .distinct()
    .count()
)

In [None]:
# Count the distinct origin countries.
(
    df
    .select("ORIGIN_COUNTRY_NAME")
    .distinct()
    .count()
)

Random Samples

In [None]:
# Sampling parameters, kept as notebook-level names (seed is reused by the
# random-split cell).
withReplacement = False
fraction = 0.5
seed = 5

# Draw a seeded sample without replacement and count the sampled rows.
df.sample(
    withReplacement=withReplacement,
    fraction=fraction,
    seed=seed,
).count()

Random Splits

In [None]:
# Split df into two DataFrames with weights 0.25 / 0.75, using the seed bound
# in the sampling cell.
dataFrames = df.randomSplit(weights=[0.25, 0.75], seed=seed)

# Is the first split larger than the second? Expected to be False.
dataFrames[0].count() > dataFrames[1].count()

Concatenating and Appending Rows (Union)

In [None]:
from pyspark.sql import Row
from pyspark.sql.functions import col

# Reuse df's schema so the new rows have the same columns and can be unioned
# with it.
schema = df.schema
newRows = [
    Row("New Country 1", "Other Country 1", 5),
    Row("New Country 2", "Other Country 3", 1),
]

# Turn the local rows into an RDD, then build a DataFrame from that RDD.
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

# Append the new rows to df, then keep rows with count == 1 whose origin is
# not "United States".
(
    df
    .union(newDF)
    .where(col("count") == 1)
    .where(col("ORIGIN_COUNTRY_NAME") != "United States")
    .show()
)

Sorting Rows

In [None]:
# Sort by a single column given by name.
df.sort("count").show(n=5)

# Sort by count, then by destination, with the columns given by name.
df.orderBy("count", "DEST_COUNTRY_NAME").show(n=5)

# The same ordering with the columns given as Column objects.
df.orderBy(
    col("count"),
    col("DEST_COUNTRY_NAME"),
).show(n=5)

In [None]:
from pyspark.sql.functions import desc, asc

# Sort by count, largest first. desc("count") is used instead of
# expr("count desc"): expr() parses a single expression, where a trailing
# identifier is read as an alias ("count AS desc"), so that form silently
# sorts in ascending order.
df.orderBy(desc("count")).show(5)

# Explicit direction per column: count descending, ties broken by destination
# ascending.
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(5)

In [None]:
# Read every file matching the "*-summary.json" glob and sort the rows by
# count inside each partition.
(
    spark.read
    .format("json")
    .load("/datasets/flight-data/json/*-summary.json")
    .sortWithinPartitions("count")
)

Limit

In [None]:
# Restrict the DataFrame to five rows and display them.
(
    df
    .limit(5)
    .show()
)

In [None]:
# Show the five rows with the largest count. The descending order is requested
# with Column.desc(): expr("count desc") would be parsed as the alias
# "count AS desc" and sort ascending, returning the five smallest counts.
df.orderBy(col("count").desc()).limit(5).show()

Repartition and Coalesce

In [None]:
# Print the current number of partitions. As a bare expression that is not the
# last statement of the cell, its value would never be displayed.
print(df.rdd.getNumPartitions())

# Each call below returns a new DataFrame and leaves df itself unchanged; the
# results are not kept, the calls only illustrate the available signatures.

# Repartition into a fixed number of partitions.
df.repartition(5)

# Repartition by the values of a column.
df.repartition(col("DEST_COUNTRY_NAME"))

# Repartition by a column into a fixed number of partitions.
df.repartition(5, col("DEST_COUNTRY_NAME"))

In [None]:
# Repartition by destination into five partitions, then coalesce down to two.
(
    df
    .repartition(5, col("DEST_COUNTRY_NAME"))
    .coalesce(2)
)

Collecting Rows to the Driver

In [None]:
# Work on a ten-row slice so that bringing rows to the driver stays cheap.
collectDF = df.limit(10)

# take(n) returns the first n rows as a list. It is printed explicitly because
# a non-final expression's value is not displayed by the notebook, so the job
# would otherwise run only for its result to be thrown away.
print(collectDF.take(5))

# show() prints a formatted table; the second form prints five rows with
# truncation of long values turned off.
collectDF.show()
collectDF.show(5, False)

# collect() returns all rows as a list, which becomes the cell's output.
collectDF.collect()

End