In [3]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession, or reuse one that is already running.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is needed at all.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
# Load people.json into a DataFrame using the SparkSession created earlier.
json_reader = spark.read.format("json")
df = json_reader.load("people.json")
# Print the DataFrame's rows to stdout as a text table.
df.show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|  haris|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
| 23|1206|Michael|
| 30|1207|   Andy|
| 19|1208|    jon|
| 19|1208|  maria|
| 19|1208|gabriel|
+---+----+-------+



In [5]:
# `spark` and `df` are defined in the earlier cells of this notebook.
# Print the DataFrame's schema as a tree: one line per column giving its
# name, data type and nullability.
df.printSchema()

root
 |-- age: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Project the DataFrame down to its "name" column and display it.
names_only = df.select("name")
names_only.show()

+-------+
|   name|
+-------+
| satish|
|  haris|
|  amith|
|  javed|
| prudvi|
|Michael|
|   Andy|
|    jon|
|  maria|
|gabriel|
+-------+



In [7]:
# Show every person's name next to their age incremented by one.
age_plus_one = df['age'] + 1
df.select(df['name'], age_plus_one).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
| satish|     26.0|
|  haris|     29.0|
|  amith|     40.0|
|  javed|     24.0|
| prudvi|     24.0|
|Michael|     24.0|
|   Andy|     31.0|
|    jon|     20.0|
|  maria|     20.0|
|gabriel|     20.0|
+-------+---------+



In [8]:
# Keep only the rows whose age is greater than 21 and display them.
is_over_21 = df['age'] > 21
df.where(is_over_21).show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|  haris|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
| 23|1206|Michael|
| 30|1207|   Andy|
+---+----+-------+



In [9]:

# Group the rows by age and show how many people share each age.
people_per_age = df.groupBy("age").count()
people_per_age.show()

+---+-----+
|age|count|
+---+-----+
| 30|    1|
| 28|    1|
| 19|    3|
| 23|    3|
| 25|    1|
| 39|    1|
+---+-----+



In [10]:
# Expose the DataFrame to Spark SQL as the temporary view "people"
# (replacing any existing view of that name).
df.createOrReplaceTempView("people")

# Query the view with plain SQL; the result is itself a DataFrame.
people_query = "SELECT * FROM people"
sqlDF = spark.sql(people_query)
sqlDF.show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|  haris|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
| 23|1206|Michael|
| 30|1207|   Andy|
| 19|1208|    jon|
| 19|1208|  maria|
| 19|1208|gabriel|
+---+----+-------+



In [11]:
# Register the DataFrame as a global temporary view.
# The "OrReplace" variant keeps this cell re-runnable: createGlobalTempView
# raises if a global view named "people" already exists in the running
# Spark application, which happens as soon as the cell is executed twice.
df.createOrReplaceGlobalTempView("people")

# Global temporary views are tied to the system-preserved database
# `global_temp`, so the view name must be qualified with it.
spark.sql("SELECT * FROM global_temp.people").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|  haris|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
| 23|1206|Michael|
| 30|1207|   Andy|
| 19|1208|    jon|
| 19|1208|  maria|
| 19|1208|gabriel|
+---+----+-------+



In [12]:

# A global temporary view is visible across sessions: open a fresh session
# and query the same view from there.
other_session = spark.newSession()
other_session.sql("SELECT * FROM global_temp.people").show()

+---+----+-------+
|age|  id|   name|
+---+----+-------+
| 25|1201| satish|
| 28|1202|  haris|
| 39|1203|  amith|
| 23|1204|  javed|
| 23|1205| prudvi|
| 23|1206|Michael|
| 30|1207|   Andy|
| 19|1208|    jon|
| 19|1208|  maria|
| 19|1208|gabriel|
+---+----+-------+

