In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active session if one exists; otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("json_examples")
    .getOrCreate()
)

# Sample data: one (name, age, city) tuple per row.
data = [
    ("John", 30, "New York"),
    ("Alice", 25, "Los Angeles"),
]

# Explicit schema, built field by field; every column is nullable.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

df = spark.createDataFrame(data, schema)

In [5]:
# Print the sample DataFrame as a table to confirm the rows and columns loaded as expected.
df.show()

+-----+---+-----------+
| name|age|       city|
+-----+---+-----------+
| John| 30|   New York|
|Alice| 25|Los Angeles|
+-----+---+-----------+



In [33]:
# Two derived columns:
#   struct_data - the three source columns packed into a single struct
#   json_data   - that struct serialized to a JSON string
df_json = (
    df
    .withColumn("struct_data", struct("name", "age", "city"))
    .withColumn("json_data", to_json("struct_data"))
)

In [61]:
# truncate=False keeps the full JSON strings visible instead of cutting them off.
df_json.show(truncate=False)
# The schema shows struct_data as a nested struct and json_data as a plain string.
df_json.printSchema()

+-----+---+-----------+------------------------+----------------------------------------------+
|name |age|city       |struct_data             |json_data                                     |
+-----+---+-----------+------------------------+----------------------------------------------+
|John |30 |New York   |{John, 30, New York}    |{"name":"John","age":30,"city":"New York"}    |
|Alice|25 |Los Angeles|{Alice, 25, Los Angeles}|{"name":"Alice","age":25,"city":"Los Angeles"}|
+-----+---+-----------+------------------------+----------------------------------------------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- struct_data: struct (nullable = false)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- city: string (nullable = true)
 |-- json_data: string (nullable = true)



In [13]:
from pyspark.sql.functions import from_json

In [45]:
# The trailing comma is what makes a one-element tuple; parentheses alone only group.
print(
    type(('{"name":"John","age":30,"city":"New York"}',)),  # <class 'tuple'>
    type(('{"name":"John","age":30,"city":"New York"}')),  # <class 'str'>
    sep="\n",
)


<class 'tuple'>
<class 'str'>


In [57]:
# Sample JSON data: each row is a one-element tuple (note the trailing comma),
# so every JSON document lands in a single column.
json_data = [
    ('{"name":"John","age":30,"city":"New York"}',),
    ('{"name":"Alice","age":25,"city":"Los Angeles"}',)
]

# DDL-style schema string: declares the column name and type explicitly
# instead of leaving Spark to infer the type from the data.
_schema = "json_string string"
json_df = spark.createDataFrame(json_data, schema=_schema)

json_df.printSchema()
json_df.show(truncate=False)

root
 |-- json_string: string (nullable = true)

+----------------------------------------------+
|json_string                                   |
+----------------------------------------------+
|{"name":"John","age":30,"city":"New York"}    |
|{"name":"Alice","age":25,"city":"Los Angeles"}|
+----------------------------------------------+



In [67]:
# Schema describing the fields expected inside each JSON document.
json_schema = StructType(
    [
        StructField(name="name", dataType=StringType(), nullable=True),
        StructField(name="age", dataType=IntegerType(), nullable=True),
        StructField(name="city", dataType=StringType(), nullable=True),
    ]
)

# Parse the JSON text column into a struct column alongside the original string.
parsed_df = json_df.withColumn(
    "parsed_data",
    from_json(json_df["json_string"], json_schema),
)

# Inspect the result: the full rows first, then the resulting schema.
parsed_df.show(truncate=False)
parsed_df.printSchema()

# Dot notation reaches into the struct: a single field, then all fields flattened.
parsed_df.select("parsed_data.name").show()
parsed_df.select("parsed_data.*").show()

+----------------------------------------------+------------------------+
|json_string                                   |parsed_data             |
+----------------------------------------------+------------------------+
|{"name":"John","age":30,"city":"New York"}    |{John, 30, New York}    |
|{"name":"Alice","age":25,"city":"Los Angeles"}|{Alice, 25, Los Angeles}|
+----------------------------------------------+------------------------+

root
 |-- json_string: string (nullable = true)
 |-- parsed_data: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- city: string (nullable = true)

+-----+
| name|
+-----+
| John|
|Alice|
+-----+

+-----+---+-----------+
| name|age|       city|
+-----+---+-----------+
| John| 30|   New York|
|Alice| 25|Los Angeles|
+-----+---+-----------+

