### Types

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType
from pyspark.sql.types import StructType, StructField, StringType

# Ten sample rows of unparsed JSON text. Each tuple holds, in order:
#   1. a JSON object with a nested "meta" object,
#   2. a JSON array of integers,
#   3. a flat JSON object whose keys vary per row and whose values are all strings.
data = [
    ('{"id":1,"name":"u1","meta":{"country":"IN","tier":"gold"}}', '[1,2,3]', '{"device":"android","ip":"10.0.0.1","os":"13"}'),
    ('{"id":2,"name":"u2","meta":{"country":"IN","tier":"silver"}}', '[2,3,4]', '{"browser":"chrome","session_id":"abc123","latency_ms":"240"}'),
    ('{"id":3,"name":"u3","meta":{"country":"US","tier":"gold"}}', '[3,4,5]', '{"risk_score":"0.87","rule":"geo_mismatch"}'),
    ('{"id":4,"name":"u4","meta":{"country":"US","tier":"bronze"}}', '[4,5,6]', '{"country":"IN","currency":"INR"}'),
    ('{"id":5,"name":"u5","meta":{"country":"GB","tier":"gold"}}', '[5,6,7]', '{"app_version":"5.2.1","build":"9012"}'),
    ('{"id":6,"name":"u6","meta":{"country":"GB","tier":"silver"}}', '[6,7,8]', '{"fraud":"true","reason":"velocity"}'),
    ('{"id":7,"name":"u7","meta":{"country":"DE","tier":"gold"}}', '[7,8,9]', '{"region":"EU","gdpr":"yes"}'),
    ('{"id":8,"name":"u8","meta":{"country":"DE","tier":"bronze"}}', '[8,9,10]', '{"merchant":"m123","tier":"gold"}'),
    ('{"id":9,"name":"u9","meta":{"country":"FR","tier":"gold"}}', '[9,10,11]', '{"channel":"web","utm":"spring"}'),
    ('{"id":10,"name":"u10","meta":{"country":"FR","tier":"silver"}}', '[10,11,12]', '{"score":"42","model":"xgboost"}'),
]

# Every column is a nullable string at this stage; the JSON is parsed later.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("struct_json", "array_json", "map_json")
    ]
)

df = spark.createDataFrame(data, schema=schema)

df.display()

In [0]:
# Target types for each JSON string column, written as Spark DDL type strings.
struct_schema = "struct<id:integer, name:string, meta:struct<country:string, tier:string>>"
array_schema = "array<int>"
map_schema = "map<string, string>"

# Parse each raw JSON column into its typed counterpart and append the results
# after the original columns (struct, then array, then map).
parsed_columns = [
    F.from_json(F.col('struct_json'), struct_schema).alias('struct_casted'),
    F.from_json(F.col('array_json'), array_schema).alias('array_casted'),
    F.from_json(F.col('map_json'), map_schema).alias('map_casted'),
]

df.select('*', *parsed_columns).display()


### Functions like explode...

In [0]:
# Element type of the `events` array: a struct of two nullable strings.
event_struct = StructType([
    StructField("ts", StringType(), True),
    StructField("type", StringType(), True),
])

# NOTE: this rebinds `schema`, `data` and `df`, which the "Types" cells above
# also define; cells that expect the JSON-string frame must run before this one.
schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    # array<string>; the column itself may be null
    StructField("items", ArrayType(StringType(), containsNull=True), nullable=True),
    # map<string, string>; the column itself may be null
    StructField("attrs", MapType(StringType(), StringType(), valueContainsNull=True), nullable=True),
    # array<struct<ts, type>>; the column itself may be null
    StructField("events", ArrayType(event_struct, containsNull=True), nullable=True),
])

data = [
    # id 1: every collection populated
    (1, ["a", "b"], {"country": "IN", "tier": "gold"}, [{"ts": "2025-01-01T10:00:00Z", "type": "click"}, {"ts": "2025-01-01T10:01:00Z", "type": "pay"}]),
    # id 2: empty arrays, single-entry map
    (2, [], {"country": "US"}, []),
    # id 3: all collections null
    (3, None, None, None),
    # id 4: empty map, single-element arrays
    (4, ["x"], {}, [{"ts": "2025-01-02T12:00:00Z", "type": "view"}]),
]

df = spark.createDataFrame(data, schema=schema)
df.display()

In [0]:
events_col = F.col('events')
attrs_col = F.col('attrs')

# array<struct>: explode keeps each element as one struct column (default name
# `col`), while inline also spreads the struct fields into separate columns.
df.select('*', F.explode(events_col)).display()
df.select('*', F.inline(events_col)).display()

# map: explode emits one row per entry with `key` / `value` columns, while
# map_entries keeps one row per input row and returns array<struct<key, value>>.
df.select('*', F.explode(attrs_col)).display()
df.select('*', F.map_entries(attrs_col)).display()

In [0]:
items_col = F.col('items')

# explode vs explode_outer on the same array column, shown back to back so the
# handling of null / empty `items` can be compared.
df.select('*', F.explode(items_col)).display()
df.select('*', F.explode_outer(items_col)).display()

# posexplode adds the element position; here the two outputs are named explicitly.
df.select('*', F.posexplode(items_col).alias('i', 'item')).display()
# Without aliases the generated columns keep their default names.
df.select('*', F.posexplode_outer(items_col)).display()

In [0]:
# explode() applied directly to a map yields TWO output columns (key, value),
# but withColumn() supplies exactly one name, so Spark rejects the original
# call with an alias-count mismatch (AnalysisException).
# Turning the map into array<struct<key, value>> with map_entries() first makes
# explode() produce a single struct column per entry, which fits withColumn().
# As with any explode(), rows whose `attrs` is null or empty yield no output rows.
df.withColumn('attrs_exploded', F.explode(F.map_entries('attrs'))).display()


In [0]:
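# items  -> array<string>  -> explode into multiple rows -> gives one column
# attrs  -> map (key:value) -> explode into multiple rows -> gives two columns (key, value)
# events -> array<struct>  -> inline into multiple rows  -> gives one column per struct field (ts, type)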

In [0]:
id_col = F.col("id")
items_col = F.col("items")
attrs_col = F.col("attrs")
events_col = F.col("events")

# 1) explode -- one output row per array element; rows whose array is null or
#    empty contribute no output rows.
df_explode_items = df.select(id_col, F.explode(items_col).alias("item"))
df_explode_items.show(truncate=False)

# 2) explode_outer -- same as explode, except that a null or empty array still
#    yields one row, with a null element.
df_explode_outer_items = df.select(id_col, F.explode_outer(items_col).alias("item"))
df_explode_outer_items.show(truncate=False)

# 3) posexplode -- explode plus the element's position within the array;
#    null / empty arrays are dropped.
df_posexplode_items = df.select(
    id_col,
    F.posexplode(items_col).alias("pos", "item"),
)
df_posexplode_items.show(truncate=False)

# 4) posexplode_outer -- posexplode that keeps null / empty arrays as a single
#    row with null position and element.
df_posexplode_outer_items = df.select(
    id_col,
    F.posexplode_outer(items_col).alias("pos", "item"),
)
df_posexplode_outer_items.show(truncate=False)

# 5) explode on a map -- produces two columns per entry, so two aliases are given.
df_explode_attrs = df.select(id_col, F.explode(attrs_col).alias("k", "v"))
df_explode_attrs.show(truncate=False)

# 6) map_entries + explode -- convert the map to array<struct<key, value>>,
#    explode it into one struct column, then pull the fields out by name.
df_map_entries = df.select(
    id_col,
    F.explode(F.map_entries(attrs_col)).alias("entry"),
)
entry_col = F.col("entry")
df_map_entries.select(
    id_col,
    entry_col.getField("key").alias("k"),
    entry_col.getField("value").alias("v"),
).show(truncate=False)

# 7) inline -- array<struct<...>> becomes one row per element with each struct
#    field expanded into its own column.
df_inline_events = df.select(id_col, F.inline(events_col))
df_inline_events.show(truncate=False)

# 8) inline_outer -- inline that keeps rows whose array is null, emitting one
#    row with null values in the expanded columns.
df_inline_outer_events = df.select(id_col, F.inline_outer(events_col))
df_inline_outer_events.show(truncate=False)
