# Ex-2030 - Data Frame Structure


In [1]:
# 1. Import SparkSession and necessary types
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DateType
from pyspark.sql.types import _parse_datatype_string

# Build the Spark session shared by every cell below; getOrCreate() returns an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("CustomerContacts")
    .getOrCreate()
)

In [2]:
# Explicit schema: one nullable column per contact attribute, each with a
# concrete Spark type. Built incrementally with StructType.add(), which wraps
# every (name, type, nullable) triple in a StructField.
structured_schema = (
    StructType()
    .add("FullName", StringType(), nullable=True)
    .add("Street", StringType(), nullable=True)
    .add("City", StringType(), nullable=True)
    .add("PostalCode", StringType(), nullable=True)
    .add("Country", StringType(), nullable=True)
    .add("IsActive", BooleanType(), nullable=True)
    .add("LastContactDate", DateType(), nullable=True)
)

In [3]:
# 2. Create data with an intentional error (invalid "IsActive" value)
from datetime import datetime

# Each raw row ends with an ISO "YYYY-MM-DD" string; the comprehension keeps the
# leading fields untouched and swaps that trailing string for a parsed datetime.
data = [
    (*fields, datetime.strptime(last_contact, "%Y-%m-%d"))
    for *fields, last_contact in [
        ("John Doe", "123 Main St", "New York", "10001", "USA", True, "2024-05-01"),
        ("Jane Smith", "456 Elm St", "London", "SW1A 1AA", "UK", False, "2024-04-15"),
        # Intentional error: "inactive" is a str, not a valid Boolean value
        ("Alice Johnson", "789 Oak St", "Berlin", "10115", "Germany", "inactive", "2024-03-20"),
    ]
]

# Try to build the DataFrame against the explicit StructType schema; the failure
# is the point of this cell, so it is caught and printed rather than raised.
try:
    df = spark.createDataFrame(data, structured_schema)
    df.show()
except Exception as err:
    print(f"Error creating DataFrame: {err}")


Error creating DataFrame: [CANNOT_ACCEPT_OBJECT_IN_TYPE] `BooleanType()` can not accept object `inactive` in type `str`.


In [4]:
# 3. Same schema expressed as a DDL-style string: "<column> <TYPE>" pairs
#    separated by ", ".
schema_string = ", ".join([
    "FullName STRING",
    "Street STRING",
    "City STRING",
    "PostalCode STRING",
    "Country STRING",
    "IsActive BOOLEAN",
    "LastContactDate DATE",
])

# Re-run the DataFrame creation with the string schema; any failure is caught
# and printed so the notebook keeps running.
try:
    df = spark.createDataFrame(data, schema_string)
    df.show()
except Exception as err:
    print(f"Error creating DataFrame: {err}")


Error creating DataFrame: [CANNOT_ACCEPT_OBJECT_IN_TYPE] `BooleanType()` can not accept object `inactive` in type `str`.


In [5]:
# Show the StructType that Spark derives from schema_string so it can be
# compared by eye with the hand-written structured_schema.
# NOTE: _parse_datatype_string is a private (underscore-prefixed) pyspark helper.
parsed_schema = _parse_datatype_string(schema_string)
parsed_schema

StructType([StructField('FullName', StringType(), True), StructField('Street', StringType(), True), StructField('City', StringType(), True), StructField('PostalCode', StringType(), True), StructField('Country', StringType(), True), StructField('IsActive', BooleanType(), True), StructField('LastContactDate', DateType(), True)])