In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
spark = SparkSession.builder.appName("PySparkPractice").getOrCreate()


## Simple JSON File

In [9]:
data = [
    {"id": 1, "name": "Riya"},
    {"id": 2, "name": "Amit"}
]

df = spark.createDataFrame(data)
df.show()
df.printSchema()

+---+----+
| id|name|
+---+----+
|  1|Riya|
|  2|Amit|
+---+----+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



## Multiline (Pretty) JSON


In [10]:
data = [{
  "id": 1,
  "name": "Riya",
  "city": "Bangalore"
}]
df = spark.createDataFrame(data)
df.printSchema()
df.show()

root
 |-- city: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---------+---+----+
|     city| id|name|
+---------+---+----+
|Bangalore|  1|Riya|
+---------+---+----+



## Nested JSON

In [11]:
data = [{
  "id": 1,
  "name": "Riya",
  "address": {
    "city": "Bangalore",
    "pincode": 560001
  }
}]
df = spark.createDataFrame(data)
df.show()
df.printSchema()

df2 = df.select(
    F.col('id'),
    F.col('name'),
    F.col('address.city').alias('city'),
    F.col('address.pincode').alias('pincode')
)
df2.show()
df2.printSchema()

+--------------------+---+----+
|             address| id|name|
+--------------------+---+----+
|{pincode -> 56000...|  1|Riya|
+--------------------+---+----+

root
 |-- address: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+----+---------+-------+
| id|name|     city|pincode|
+---+----+---------+-------+
|  1|Riya|Bangalore| 560001|
+---+----+---------+-------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- pincode: string (nullable = true)



## JSON with Arrays

In [12]:
data = [{
  "id": 1,
  "name": "Riya",
  "skills": ["python", "sql", "spark"]
}]


df = spark.createDataFrame(data)
df.printSchema()
df.show()


df2 = df.withColumn('items', F.explode(F.col('skills')))
df2 = df2.select(
    F.col('id'),
    F.col('name'),
    F.col('items').alias('skills'),
)
df2.printSchema()
df2.show()





root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+----+--------------------+
| id|name|              skills|
+---+----+--------------------+
|  1|Riya|[python, sql, spark]|
+---+----+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)

+---+----+------+
| id|name|skills|
+---+----+------+
|  1|Riya|python|
|  1|Riya|   sql|
|  1|Riya| spark|
+---+----+------+



## Complex Multi-level Nested JSON

In [14]:
data =[{
  "order_id": 1001,
  "customer": {
    "id": 50,
    "name": "Riya",
    "location": {
      "city": "Bangalore",
      "zipcode": 560001
    }
  },
  "items": [
    {"sku": "A1", "qty": 2, "price": 100},
    {"sku": "B1", "qty": 1, "price": 200}
  ]
},
{
  "order_id": 1002,
  "customer": {
    "id": 51,
    "name": "Riya",
    "location": {
      "city": "Bangalore",
      "zipcode": 560001
    }
  },
  "items": [
    {"sku": "A1", "qty": 2, "price": 100},
    {"sku": "B1", "qty": 1, "price": 200}
  ]
}]

schema = StructType([
    StructField("order_id", IntegerType()),
    StructField("customer", StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType()),
        StructField("location", StructType([
            StructField("city", StringType()),
            StructField("zipcode", IntegerType())
        ]))
    ])),
    StructField("items", ArrayType(
        StructType([
            StructField("sku", StringType()),
            StructField("qty", IntegerType()),
            StructField("price", IntegerType())
        ])
    ))
])


df = spark.createDataFrame(data, schema)
# df.printSchema()
# df.show()
df2 = df.withColumn('item', F.explode(F.col('items')))
df2 = df2.select(
      F.col('order_id'),
      F.col('customer.id').alias('customer_id'),
      F.col('customer.name').alias('name'),
      F.col('customer.location.city').alias('city'),
      F.col('customer.location.zipcode').alias('zipcode'),
      F.col('item.sku').alias('sku'),
      F.col('item.qty').alias('qty'),
      F.col('item.price').alias('price')
)
df2.printSchema()
df2.show()



root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- sku: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- price: integer (nullable = true)

+--------+-----------+----+---------+-------+---+---+-----+
|order_id|customer_id|name|     city|zipcode|sku|qty|price|
+--------+-----------+----+---------+-------+---+---+-----+
|    1001|         50|Riya|Bangalore| 560001| A1|  2|  100|
|    1001|         50|Riya|Bangalore| 560001| B1|  1|  200|
|    1002|         51|Riya|Bangalore| 560001| A1|  2|  100|
|    1002|         51|Riya|Bangalore| 560001| B1|  1|  200|
+--------+-----------+----+---------+-------+---+---+-----+

