In [0]:
from pyspark.sql.functions import explode, col


In [0]:
# Sample input: a single JSON document with a top-level "persons" array.
# Each person has "name", "age" and a "cars" array; each car has a brand
# "name" and a "models" array of strings. The string is written to storage
# verbatim by a later cell, so its exact contents determine the file.
# NOTE(review): "Huyndai" looks like a misspelling of "Hyundai"; left as is
# because the displayed outputs and the saved table carry this exact value.
source_json = """
{
    "persons": [
        {
            "name": "John",
            "age": 30,
            "cars": [
                {
                    "name": "Ford",
                    "models": [
                        "Fiesta",
                        "Focus",
                        "Mustang"
                    ]
                },
                {
                    "name": "BMW",
                    "models": [
                        "320",
                        "X3",
                        "X5"
                    ]
                }
            ]
        },
        {
            "name": "Peter",
            "age": 46,
            "cars": [
                {
                    "name": "Huyndai",
                    "models": [
                        "i10",
                        "i30"
                    ]
                },
                {
                    "name": "Mercedes",
                    "models": [
                        "E320",
                        "E63 AMG"
                    ]
                }
            ]
        }
    ]
}
"""

In [0]:
# Remove any copy of the sample file left by an earlier run (single file, no recursion).
dbutils.fs.rm("/FileStore/tables/multiline_nested_json.json", recurse=False)

In [0]:
# Write the sample JSON document to DBFS. The third argument (overwrite=True)
# replaces an existing file, so this cell can be re-run on its own instead of
# failing when the file is already there.
dbutils.fs.put('/FileStore/tables/multiline_nested_json.json', source_json, True)

In [0]:
# The file holds one JSON document spread over many lines, so it must be read
# in multi-line mode rather than as one JSON record per line.
source_df = (
    spark.read
    .json("/FileStore/tables/multiline_nested_json.json", multiLine=True)
)
display(source_df)

persons
"List(List(30, List(List(List(Fiesta, Focus, Mustang), Ford), List(List(320, X3, X5), BMW)), John), List(46, List(List(List(i10, i30), Huyndai), List(List(E320, E63 AMG), Mercedes)), Peter))"


In [0]:
# Turn the top-level "persons" array into one row per person struct
persons = source_df.select(
    explode(col("persons")).alias("persons")
)
display(persons)

persons
"List(30, List(List(List(Fiesta, Focus, Mustang), Ford), List(List(320, X3, X5), BMW)), John)"
"List(46, List(List(List(i10, i30), Huyndai), List(List(E320, E63 AMG), Mercedes)), Peter)"


In [0]:
# Explode all car brands into different rows.
# Step 1 keeps the person's name/age and emits one row per entry of "cars"
# (the whole car struct is kept as persons_cars_brands for the next stage).
# Step 2 reads the brand name from that struct in a separate projection, so the
# alias is defined before it is used instead of being referenced inside the
# same select that creates it.
persons_cars = (
    persons
    .select(
        col("persons.name").alias("persons_name"),
        col("persons.age").alias("persons_age"),
        explode(col("persons.cars")).alias("persons_cars_brands"),
    )
    .withColumn("persons_cars_brand", col("persons_cars_brands.name"))
)
display(persons_cars)

persons_name,persons_age,persons_cars_brands,persons_cars_brand
John,30,"List(List(Fiesta, Focus, Mustang), Ford)",Ford
John,30,"List(List(320, X3, X5), BMW)",BMW
Peter,46,"List(List(i10, i30), Huyndai)",Huyndai
Peter,46,"List(List(E320, E63 AMG), Mercedes)",Mercedes


In [0]:
# One row per car model: flatten the "models" array of each brand struct and
# carry the person and brand columns along unchanged
persons_cars_models = persons_cars.select(
    "persons_name",
    "persons_age",
    "persons_cars_brand",
    explode(col("persons_cars_brands.models")).alias("persons_cars_model"),
)
display(persons_cars_models)

persons_name,persons_age,persons_cars_brand,persons_cars_model
John,30,Ford,Fiesta
John,30,Ford,Focus
John,30,Ford,Mustang
John,30,BMW,320
John,30,BMW,X3
John,30,BMW,X5
Peter,46,Huyndai,i10
Peter,46,Huyndai,i30
Peter,46,Mercedes,E320
Peter,46,Mercedes,E63 AMG


In [0]:
# Persist the flattened rows as a Delta table. Append mode adds them to
# whatever the table already contains, so every run of this cell inserts the
# same rows again.
# NOTE(review): presumably the `delta` schema already exists - confirm.
persons_cars_models.write.saveAsTable(
    'delta.customer_cars',
    format='delta',
    mode='append',
)


In [0]:
%sql
-- Show every row currently stored in the saved table
SELECT *
FROM delta.customer_cars