# Split and indexing

In [None]:
# What is split in PySpark?
# split() takes a string column and a delimiter pattern (interpreted as a regular expression, e.g. a comma, space, or hyphen) and returns an array of substrings.
# The result is a single array column, not separate columns.
# Individual elements can then be extracted from the array by index, e.g. col("arr")[0] or col("arr").getItem(0).

In [1]:
from pyspark.sql.functions import split, col
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('split')
    .getOrCreate()
)

# Sample rows: each record is a one-element tuple holding a
# comma-separated "name,age,country" string.
data = [
    ("Alice,25,USA",),
    ("Bob,30,UK",),
    ("Charlie,35,India",),
]
df = spark.createDataFrame(data, schema=["info"])
df.show()


+----------------+
|            info|
+----------------+
|    Alice,25,USA|
|       Bob,30,UK|
|Charlie,35,India|
+----------------+



In [2]:
# Turn each comma-delimited "info" string into an array of its parts,
# stored in a new column "info_split" next to the original.
df_split = df.withColumn(
    "info_split",
    split(df["info"], ","),
)
# truncate=False prints the full cell contents instead of shortening long values.
df_split.show(truncate=False)

+----------------+--------------------+
|info            |info_split          |
+----------------+--------------------+
|Alice,25,USA    |[Alice, 25, USA]    |
|Bob,30,UK       |[Bob, 30, UK]       |
|Charlie,35,India|[Charlie, 35, India]|
+----------------+--------------------+



# Indexing

In [None]:
# Promote each positional element of the "info_split" array to its own
# named column: index 0 -> name, 1 -> age, 2 -> country.
# NOTE: split() produces string elements, so "age" is a string column here;
# cast it if a numeric type is needed.
df_final = (
    df_split
    .withColumn("name", col("info_split").getItem(0))
    .withColumn("age", col("info_split").getItem(1))
    .withColumn("country", col("info_split").getItem(2))
)
df_final.select('name', 'age', 'country').show()


+-------+---+-------+
|   name|age|country|
+-------+---+-------+
|  Alice| 25|    USA|
|    Bob| 30|     UK|
|Charlie| 35|  India|
+-------+---+-------+

