In [4]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below. Parentheses replace the
# backslash continuations so the builder chain can be edited line by line.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [5]:
# Read people.json through the generic reader interface and preview the rows.
df = spark.read.format("json").load("people.json")
df.show()

+---+-------+
|age|   name|
+---+-------+
| 28|   John|
| 36| Andrew|
| 22| Clarke|
| 42|  Kevin|
| 51|Richard|
+---+-------+



In [6]:
# Print the DataFrame's schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
# Project only the "name" column, referenced as a Column object.
df.select(df["name"]).show()

+-------+
|   name|
+-------+
|   John|
| Andrew|
| Clarke|
|  Kevin|
|Richard|
+-------+



In [8]:
# Show each name next to the age incremented by one; the derived column is
# left unaliased, so it is displayed as "(age + 1)".
df.select(
    df.name,
    df.age + 1,
).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|   John|       29|
| Andrew|       37|
| Clarke|       23|
|  Kevin|       43|
|Richard|       52|
+-------+---------+



In [9]:
# Keep only rows whose age is strictly greater than 42 (a row with age == 42
# is excluded).
df.where(df.age > 42).show()

+---+-------+
|age|   name|
+---+-------+
| 51|Richard|
+---+-------+



In [10]:
# Count the rows for each distinct age. No ordering is requested, so the
# displayed rows are not sorted by age.
(
    df.groupBy(df["age"])
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 22|    1|
| 51|    1|
| 28|    1|
| 36|    1|
| 42|    1|
+---+-----+



In [11]:
# Expose the DataFrame to SQL under the name "people", then run the same
# "age > 42" filter as a SQL query.
df.createOrReplaceTempView("people")
people_over_42_query = "SELECT * FROM people where age>42"
sqlDF = spark.sql(people_over_42_query)
sqlDF.show()

+---+-------+
|age|   name|
+---+-------+
| 51|Richard|
+---+-------+



In [12]:
# Register the DataFrame as a global temporary view named "people".
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so the
# cell can be re-run: the plain "create" variant raises an error when a global
# view with the same name already exists in the running Spark application.
df.createOrReplaceGlobalTempView("people")

# Global temporary views are addressed through the `global_temp` qualifier.
spark.sql("SELECT * FROM global_temp.people").show()

+---+-------+
|age|   name|
+---+-------+
| 28|   John|
| 36| Andrew|
| 22| Clarke|
| 42|  Kevin|
| 51|Richard|
+---+-------+



In [13]:
# Open a second session and query the global temporary view from it.
other_session = spark.newSession()
other_session.sql("SELECT * FROM global_temp.people").show()

+---+-------+
|age|   name|
+---+-------+
| 28|   John|
| 36| Andrew|
| 22| Clarke|
| 42|  Kevin|
| 51|Richard|
+---+-------+

