In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [0]:
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()

In [0]:
df = spark.range(500).toDF("number")
df.select(col('number') + 10).limit(5).display()

(number + 10)
10
11
12
13
14


In [0]:
df.select(df['number']+10).limit(5).display()

(number + 10)
10
11
12
13
14


In [0]:
spark.range(10).collect()

[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9)]

In [0]:
import urllib.request

url = "https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/flight-data/json/2015-summary.json"

path = "/dbfs/tmp/2015-summary.json"

urllib.request.urlretrieve(url, path)

('/dbfs/tmp/2015-summary.json', <http.client.HTTPMessage at 0x72593558d750>)

In [0]:
flight_data = spark.read.json('dbfs:/tmp/2015-summary.json')
flight_data.createOrReplaceTempView("flight_data")

In [0]:
flight_data.limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
flight_data.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

In [0]:
my_manual_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType()),
    StructField("ORIGIN_COUNTRY_NAME", StringType()),
    StructField("count", IntegerType())
])

df = spark.read.format("json").schema(my_manual_schema).load("dbfs:/tmp/2015-summary.json")

In [0]:
df.select('count').limit(5).display()

count
15
1
344
15
62


In [0]:
df.select(['count', 'DEST_COUNTRY_NAME']).limit(5).display()

count,DEST_COUNTRY_NAME
15,United States
1,United States
344,United States
15,Egypt
62,United States


In [0]:
from pyspark.sql.functions import expr

### expr() lets you write SQL-like expressions in PySpark code

In [0]:
flight_data.limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
flight_data.select(expr("concat(DEST_COUNTRY_NAME, '->' ,ORIGIN_COUNTRY_NAME)").alias("combined")).limit(5).display()

combined
United States->Romania
United States->Croatia
United States->Ireland
Egypt->United States
United States->India


In [0]:
flight_data.select(expr("count + 15")).limit(5).display()

(count + 15)
30
16
359
30
77


In [0]:
flight_data.select(expr("CASE \
WHEN DEST_COUNTRY_NAME = 'United States' THEN 'USA' \
ELSE DEST_COUNTRY_NAME \
END").alias('country')).limit(5).display()

country
USA
USA
USA
Egypt
USA


In [0]:
flight_data.withColumn("added_count", expr("count * 10")).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,added_count
United States,Romania,15,150
United States,Croatia,1,10
United States,Ireland,344,3440
Egypt,United States,15,150
United States,India,62,620


In [0]:
flight_data.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
flight_data.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [0]:
from pyspark.sql import Row

my_row = Row('INDIA', 'AUS', 1)
my_row[0], my_row[1], my_row[2]

('INDIA', 'AUS', 1)

### DataFrame Transformations

In [0]:
spark.sql("select * from flight_data").limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

my_manual_schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", IntegerType())
])

first_row = Row("India", 1)
first_df = spark.createDataFrame([first_row], my_manual_schema) 

In [0]:
data = [["Alice", 20], ["Kevin", 21], ["Mirror", 22]]
col_names = ['Name', 'Age']

spark.createDataFrame(data,col_names).display()

Name,Age
Alice,20
Kevin,21
Mirror,22


In [0]:
data2 = [{"name": "Alice", "age": 30, "city": "Mumbai"},{"name": "Yukihira", "age": 17, "city": "Tokyo"}]
spark.createDataFrame(data2).display()


age,city,name
30,Mumbai,Alice
17,Tokyo,Yukihira


In [0]:
import pandas as pd

data3 = {
    "name": ["Alice", "Bob"],
    "age": [30, 25],
    "city": ["Mumbai", "Delhi"]
}

data_df = pd.DataFrame(data3)
data_df = spark.createDataFrame(data_df)


In [0]:
data_df.select('name').display()

name
Alice
Bob


In [0]:
flight_data.select(['DEST_COUNTRY_NAME', 'count']).limit(5).display()

DEST_COUNTRY_NAME,count
United States,15
United States,1
United States,344
Egypt,15
United States,62


In [0]:
flight_data.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")).limit(5).display()

DEST_COUNTRY_NAME
United States
United States
United States
Egypt
United States


In [0]:
(
    flight_data
    .select(expr("DEST_COUNTRY_NAME").alias('dest'), col("ORIGIN_COUNTRY_NAME").alias("origin"))
    .limit(5)
    .display())

dest,origin
United States,Romania
United States,Croatia
United States,Ireland
Egypt,United States
United States,India


In [0]:
flight_data.selectExpr("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME
United States,Romania
United States,Croatia
United States,Ireland
Egypt,United States
United States,India


In [0]:
(
    flight_data
    .selectExpr("*","(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as same_origin")
    .limit(5)
    .display()
)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,same_origin
United States,Romania,15,False
United States,Croatia,1,False
United States,Ireland,344,False
Egypt,United States,15,False
United States,India,62,False


In [0]:
spark.sql("select *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as same_origin from flight_data").limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,same_origin
United States,Romania,15,False
United States,Croatia,1,False
United States,Ireland,344,False
Egypt,United States,15,False
United States,India,62,False


In [0]:
flight_data.filter("DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME").display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
flight_data.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").display()

avg(count),count(DISTINCT DEST_COUNTRY_NAME)
1770.765625,132


In [0]:
spark.sql("select avg(count), count(distinct(DEST_COUNTRY_NAME)) from flight_data").display()

avg(count),count(DISTINCT DEST_COUNTRY_NAME)
1770.765625,132


In [0]:
from pyspark.sql.functions import lit

flight_data.select("*", lit(1).alias('one_column')).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,one_column
United States,Romania,15,1
United States,Croatia,1,1
United States,Ireland,344,1
Egypt,United States,15,1
United States,India,62,1


In [0]:
spark.sql("select *, 1 as one_column from flight_data").limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,one_column
United States,Romania,15,1
United States,Croatia,1,1
United States,Ireland,344,1
Egypt,United States,15,1
United States,India,62,1


In [0]:
from pyspark.sql.functions import col
flight_data.withColumn("number_two", col('count') * 5).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,number_two
United States,Romania,15,75
United States,Croatia,1,5
United States,Ireland,344,1720
Egypt,United States,15,75
United States,India,62,310


In [0]:
flight_data.withColumn("same_country", col("DEST_COUNTRY_NAME") == col("ORIGIN_COUNTRY_NAME")).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,same_country
United States,Romania,15,False
United States,Croatia,1,False
United States,Ireland,344,False
Egypt,United States,15,False
United States,India,62,False


In [0]:
spark.sql("select *, DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME as same_country from flight_data").limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,same_country
United States,Romania,15,False
United States,Croatia,1,False
United States,Ireland,344,False
Egypt,United States,15,False
United States,India,62,False


In [0]:
from pyspark.sql.functions import expr
flight_data.withColumn("same_country", expr("DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME")).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,same_country
United States,Romania,15,False
United States,Croatia,1,False
United States,Ireland,344,False
Egypt,United States,15,False
United States,India,62,False


In [0]:
flight_data.withColumnRenamed("DEST_COUNTRY_NAME", "dest").limit(5).display()

dest,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62


In [0]:
flight_data.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
flight_data.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").columns

['count']

In [0]:
flight_data.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [0]:
flight_data.withColumn("count", col("count").cast("int")).printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
spark.sql("select cast(count as int) from flight_data").limit(5).display()

count
15
1
344
15
62


In [0]:
flight_data.filter('DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME').limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
flight_data.where('DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME').display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
flight_data.filter('count > 10').limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Grenada,62


In [0]:
flight_data.where('count > 10').limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Grenada,62


In [0]:
(
    flight_data
    .filter(col("count") > 10)
    .filter('DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME')
    .display())

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
(
    flight_data
    .where('DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME')
    .where(col("count") > 10)
    .display()
)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
spark.sql("select * from flight_data where DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME and count > 10").display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002


In [0]:
flight_data.select('DEST_COUNTRY_NAME').distinct().count()

132

In [0]:
spark.sql("select count(distinct(DEST_COUNTRY_NAME)) as unique_dest_count from flight_data").display()

unique_dest_count
132


In [0]:
seed = 5
with_replacement = False
fraction = 0.1
flight_data.count(), flight_data.sample(with_replacement, fraction, seed).count()

(256, 28)

In [0]:
df = flight_data.randomSplit([0.25, 0.75], seed)
df[0].count(), df[1].count()

(69, 187)

In [0]:
from pyspark.sql import Row
schema = flight_data.schema
new_rows = [Row("India", "Africa", 245),
            Row("India", "Canada", 123),
            ]
new_df = spark.createDataFrame(new_rows, schema)
flight_data.count(), flight_data.union(new_df).count()

(256, 258)

In [0]:
from pyspark.sql.functions import desc, asc
flight_data.sort("count").limit(5).display(), flight_data.sort(desc("count")).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
Malta,United States,1
United States,Singapore,1
Moldova,United States,1
United States,Croatia,1
United States,Gibraltar,1


DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,Canada,8483
Canada,United States,8399
United States,Mexico,7187
Mexico,United States,7140


(None, None)

In [0]:
flight_data.orderBy("count").limit(5).display(), flight_data.orderBy(desc("count")).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
Malta,United States,1
United States,Singapore,1
Moldova,United States,1
United States,Croatia,1
United States,Gibraltar,1


DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,Canada,8483
Canada,United States,8399
United States,Mexico,7187
Mexico,United States,7140


(None, None)

In [0]:
spark.sql("select * from flight_data order by count desc limit 5").display(), spark.sql("select * from flight_data order by count limit 5").display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,Canada,8483
Canada,United States,8399
United States,Mexico,7187
Mexico,United States,7140


DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
Malta,United States,1
United States,Singapore,1
Moldova,United States,1
United States,Croatia,1
United States,Gibraltar,1


(None, None)

In [0]:
flight_data.orderBy(col("count").desc()).limit(5).display()

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,Canada,8483
Canada,United States,8399
United States,Mexico,7187
Mexico,United States,7140


In [0]:
flight_data.rdd.getNumPartitions()

1

In [0]:
flight_data.repartition(3)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
flight_data.repartition(col('DEST_COUNTRY_NAME'))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
flight_data.repartition(3, col('DEST_COUNTRY_NAME')).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
flight_data.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [0]:
flight_data.limit(5).collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [0]:
flight_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
flight_data.tail(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Saint Kitts and Nevis', count=145),
 Row(DEST_COUNTRY_NAME='Uruguay', ORIGIN_COUNTRY_NAME='United States', count=43),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Haiti', count=225),
 Row(DEST_COUNTRY_NAME='Bonaire, Sint Eustatius, and Saba', ORIGIN_COUNTRY_NAME='United States', count=58),
 Row(DEST_COUNTRY_NAME='Greece', ORIGIN_COUNTRY_NAME='United States', count=30)]