In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

from pyspark.sql.functions import *
from pyspark.sql.types import *

__StructType – Defines the structure of the Dataframe__

StructType is a collection or list of StructField objects.

`printSchema()` method on the DataFrame shows StructType columns as “struct”.

#### Defining Nested StructType object struct

In [2]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
             StructField('name', StructType([
                                             StructField('firstname', StringType(), True),
                                             StructField('middlename', StringType(), True),
                                             StructField('lastname', StringType(), True)
                                            ])
                        ),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

In [3]:
df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|[James, , Smith]    |36636|M     |3100  |
|[Michael, Rose, ]   |40288|M     |4300  |
|[Robert, , Williams]|42114|M     |1400  |
|[Maria, Anne, Jones]|39192|F     |5500  |
|[Jen, Mary, Brown]  |     |F     |-1    |
+--------------------+-----+------+------+



#### Adding & Changing struct of the DataFrame

Here, it copies “gender“, “salary” and “id” to the new struct “otherInfo” and add’s a new column “Salary_Grade“.

In [6]:
updatedDF = df2.withColumn("OtherInfo", 
                            struct(
                                   col("id").alias("identifier"),
                                   col("gender").alias("gender"),
                                   col("salary").alias("salary"),
                                   when(col("salary").cast(IntegerType()) < 2000,"Low")
                                  .when(col("salary").cast(IntegerType()) < 4000,"Medium")
                                  .otherwise("High").alias("Salary_Grade")
                                  )
                          )

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+-----+------+------+------------------------+
|name                |id   |gender|salary|OtherInfo               |
+--------------------+-----+------+------+------------------------+
|[James, , Smith]    |36636|M     |3100  |[36636, M, 3100, Medium]|
|[Michael, Rose, ]   |40288|M     |4300  |[40288, M, 4300, High]  |
|[Robert, , Williams]|42114|M     |1400  |[42114, M, 1400, Low]   |
|[Maria, Anne, Jones]|39192|F     |5500  |[39192, F, 5500, High]  |
|[Jen, 

#### Using SQL ArrayType and MapType

SQL StructType also supports ArrayType and MapType to define the DataFrame columns for array and map collections respectively. 

On the below example, column hobbies defined as ArrayType(StringType) and properties defined as MapType(StringType,StringType) meaning both key and value as String.

In [7]:
arrayStructureSchema = StructType([
       StructField('name', 
                   StructType([
                               StructField('firstname', StringType(), True),
                               StructField('middlename', StringType(), True),
                               StructField('lastname', StringType(), True)
                               ])
                  ),
       StructField('hobbies', ArrayType(StringType()), True),
       StructField('properties', MapType(StringType(),StringType()), True)
    ])

#### Creating StructType object struct from JSON file

If you have too many columns and the structure of the DataFrame changes now and then, it’s a good practice to load the SQL StructType schema from JSON file. 

You can get the schema by using `df2.schema.json()` , store this in a file and will use it to create a the schema from this file.

In [8]:
print(df2.schema.json())

{"fields":[{"metadata":{},"name":"name","nullable":true,"type":{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"}],"type":"struct"}


In [10]:
df2.schema.simpleString()

'struct<name:struct<firstname:string,middlename:string,lastname:string>,id:string,gender:string,salary:int>'

In [16]:
import json

jsonSchema = df2.schema.json()
schemaFromJson = StructType.fromJson(json.loads(jsonSchema))

df3 = spark.createDataFrame(
        spark.sparkContext.parallelize(structureData),schemaFromJson)

df3.printSchema()

df3.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3100|
|   [Michael, Rose, ]|40288|     M|  4300|
|[Robert, , Williams]|42114|     M|  1400|
|[Maria, Anne, Jones]|39192|     F|  5500|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



#### Creating StructType object struct from DDL String

In [17]:
ddlSchemaStr = "`fullName` STRUCT<`first`: STRING, `last`: STRING, `middle`: STRING>,`age` INT,`gender` STRING"
ddlSchema = StructType.fromDDL(ddlSchemaStr)
ddlSchema.printTreeString()

AttributeError: type object 'StructType' has no attribute 'fromDDL'

#### Checking if a Column Exists in a DataFrame

In [22]:
# print(df2.schema.fieldNames.contains("firstname"))

print(df2.schema.contains(StructField("firstname",StringType,true)))

AttributeError: 'StructType' object has no attribute 'contains'