In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.getOrCreate()

In [9]:
! head work/sample.csv

name,city
srikanth,[bangalore,tirupati]
jeevitha,[madanapalli,tirupati]
sebastian,[hyderabad,tirupati]
kumar,[kadapa,chittoor]


In [75]:
df = spark.read.format("csv").option("header","true").load("work/sample.csv")
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)

+---------+--------------+
|name     |city          |
+---------+--------------+
|srikanth |['bangalore'  |
|jeevitha |['madanapalli'|
|sebastian|['hyderabad'  |
|kumar    |['kadapa'     |
+---------+--------------+



In [76]:
from pyspark.sql.functions import explode, split, col, regexp_replace
df = df.withColumn("city", regexp_replace(col("city"), "[\\[\\]]", ""))
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)

+---------+-------------+
|name     |city         |
+---------+-------------+
|srikanth |'bangalore'  |
|jeevitha |'madanapalli'|
|sebastian|'hyderabad'  |
|kumar    |'kadapa'     |
+---------+-------------+



In [77]:
df = df.withColumn("city", split(col("city"), ","))
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- city: array (nullable = true)
 |    |-- element: string (containsNull = false)

+---------+---------------+
|name     |city           |
+---------+---------------+
|srikanth |['bangalore']  |
|jeevitha |['madanapalli']|
|sebastian|['hyderabad']  |
|kumar    |['kadapa']     |
+---------+---------------+



In [63]:
df_exploded = df.withColumn("city", explode(col("city")))
df_exploded.printSchema()
df_exploded.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = false)

+---------+-------------+
|name     |city         |
+---------+-------------+
|srikanth |"bangalore"  |
|jeevitha |"madanapalli"|
|sebastian|"hyderabad"  |
|kumar    |"kadapa"     |
+---------+-------------+



DataFrame[name: string, city: string]

In [66]:
from pyspark.sql.functions import trim

df_exploded = df_exploded.withColumn('city', trim(col("city")))

# Show the result
df_exploded.show(truncate=False)

+---------+-------------+
|name     |city         |
+---------+-------------+
|srikanth |"bangalore"  |
|jeevitha |"madanapalli"|
|sebastian|"hyderabad"  |
|kumar    |"kadapa"     |
+---------+-------------+



In [18]:
from pyspark.sql.functions import explode, col

In [20]:
df.withColumn("cities", explode(col('city')))

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "explode(city)" due to data type mismatch: Parameter 1 requires the ("ARRAY" or "MAP") type, however "city" has the type "STRING".;
'Project [name#17, city#18, explode(city#18) AS cities#33]
+- Relation [name#17,city#18] csv


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

# Initialize a Spark session
spark = SparkSession.builder.appName("explodeExample").getOrCreate()

# Sample data
data = [
    ("PMI", "OPO", [2, 1]),
    ("ATH", "BCN", [3]),
    ("JFK", "MAD", [5, 4, 6]),
    ("HND", "LAX", [8, 9, 7, 0])
]

# Create DataFrame
df = spark.createDataFrame(data, ["origin", "destination", "internal_flight_ids"])


In [24]:
df.printSchema()

root
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- internal_flight_ids: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [23]:
# Transform the DataFrame
df_exploded = df.withColumn("internal_flight_ids", explode(col("internal_flight_ids")))

# Show the result
df_exploded.show()

+------+-----------+-------------------+
|origin|destination|internal_flight_ids|
+------+-----------+-------------------+
|   PMI|        OPO|                  2|
|   PMI|        OPO|                  1|
|   ATH|        BCN|                  3|
|   JFK|        MAD|                  5|
|   JFK|        MAD|                  4|
|   JFK|        MAD|                  6|
|   HND|        LAX|                  8|
|   HND|        LAX|                  9|
|   HND|        LAX|                  7|
|   HND|        LAX|                  0|
+------+-----------+-------------------+



In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

# Initialize a Spark session
spark = SparkSession.builder.appName("explodeExample").getOrCreate()

# Sample data
data = [
    ("srikanth", ["bangalore", "tirupati"]),
    ("jeevitha", ["madanapalli", "tirupati"]),
    ("sebastian", ["hyderabad", "tirupati"]),
    ("kumar", ["kadapa", "chittoor"])
]

# Create DataFrame
df = spark.createDataFrame(data, ["name", "city"])

# Transform the DataFrame
df_exploded = df.withColumn("city", explode(col("city")))

# Show the result
df_exploded.show()


+---------+-----------+
|     name|       city|
+---------+-----------+
| srikanth|  bangalore|
| srikanth|   tirupati|
| jeevitha|madanapalli|
| jeevitha|   tirupati|
|sebastian|  hyderabad|
|sebastian|   tirupati|
|    kumar|     kadapa|
|    kumar|   chittoor|
+---------+-----------+

