##### In complex data analytics we rarely work with only simple, basic data types. Real-world data is often structured or nested, so we need Spark to handle complex data types. These come in many forms: maps, arrays, structs, dates, timestamps, fields, etc.

https://spark.apache.org/docs/latest/sql-ref-datatypes.html

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [4]:
# Obtain the SparkSession for this notebook, registered under the
# application name "StructType" (getOrCreate returns an existing session
# when one is already active).
spark = (
    SparkSession.builder
    .appName("StructType")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/23 22:03:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Sample rows. Each row is a 4-tuple:
#   (id, question labels, respondent answers, partition key)
# The label tuple is the same for every row, so it is written once here and
# combined with the per-row values below.
# Note: some answers are empty strings (Jen's lastname, Michael's age), and
# the partition values look like yyyymmdd dates stored as plain strings.
data = [
    (research_id, ("name", "lastname", "age", "review"), answers, partition)
    for research_id, answers, partition in (
        ("123456", ("James", "Smith", "28", "7"), "20230523"),
        ("123456", ("Robert", "Williams", "32", "5"), "20230401"),
        ("123456", ("Maria", "Anne", "25", "9"), "20230215"),
        ("123457", ("Jen", "", "40", "4"), "20230122"),
        ("123458", ("Michael", "Jones", "", "8"), "20230517"),
        ("123458", ("William", "Smith", "33", "2"), "20230302"),
    )
]

In [10]:
# Explicit schema for the rows above: two string columns (id, partition)
# wrapped around two nested structs. Every leaf field is a string.
# The struct columns themselves do not pass `nullable`, so they keep
# StructField's default (nullable=True); only their inner fields set it.
schema = StructType([
    StructField("id", StringType(), nullable=False),
    # Question labels: all four inner fields are required.
    StructField(
        "fields_research",
        StructType([
            StructField(field_name, StringType(), nullable=False)
            for field_name in ("firstname", "lastname", "age", "review")
        ]),
    ),
    # Respondent answers: everything may be null except the review.
    StructField(
        "answers_research",
        StructType([
            StructField(field_name, StringType(), nullable=may_be_null)
            for field_name, may_be_null in (
                ("firstname", True),
                ("lastname", True),
                ("age", True),
                ("review", False),
            )
        ]),
    ),
    StructField("partition", StringType(), nullable=False),
])

In [11]:
# Build the DataFrame from the in-memory rows, applying the explicit schema
# defined above rather than leaving Spark to infer column names and types.
df = spark.createDataFrame(data, schema)

In [12]:
# Inspect the DataFrame: print the schema tree (column names, types and
# nullability), then display the rows. truncate=False keeps the full text of
# each cell so the nested struct values are shown in their entirety.
df.printSchema()
df.show(truncate=False)

root
 |-- id: string (nullable = false)
 |-- fields_research: struct (nullable = true)
 |    |-- firstname: string (nullable = false)
 |    |-- lastname: string (nullable = false)
 |    |-- age: string (nullable = false)
 |    |-- review: string (nullable = false)
 |-- answers_research: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |    |-- age: string (nullable = true)
 |    |-- review: string (nullable = false)
 |-- partition: string (nullable = false)

+------+-----------------------------+-------------------------+---------+
|id    |fields_research              |answers_research         |partition|
+------+-----------------------------+-------------------------+---------+
|123456|{name, lastname, age, review}|{James, Smith, 28, 7}    |20230523 |
|123456|{name, lastname, age, review}|{Robert, Williams, 32, 5}|20230401 |
|123456|{name, lastname, age, review}|{Maria, Anne, 25, 9}     |20230215 |
|123457|{name, last