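In PySpark, `withColumn()` is a DataFrame method used to add a new column or to replace an existing one. It does not modify the DataFrame it is called on; it returns a new DataFrame, which is why every example below assigns the result to a new variable. Here's a detailed explanation with a sample DataFrame and code examples.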

In [0]:
from pyspark.sql.functions import col, lit, when

# Sample rows as (id, name, salary) tuples; the field order must line up with `columns`.
data = [(1, "Alice", 2000), (2, "Bob", 3000), (3, "Charlie", 1500)]
columns = ["id", "name", "salary"]

# Only column names are supplied, so the column types are left for Spark to infer
# from the Python values.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object pre-created by the notebook environment. Confirm before running elsewhere.
df = spark.createDataFrame(data, schema=columns)
df.display()


id,name,salary
1,Alice,2000
2,Bob,3000
3,Charlie,1500


In [0]:
from pyspark.sql.functions import lit

# Add a brand-new column holding the same constant for every row.
# lit() turns the plain Python string into a column expression for withColumn().
df_new = df.withColumn(
    "country",
    lit("India"),
)
df_new.display()


id,name,salary,country
1,Alice,2000,India
2,Bob,3000,India
3,Charlie,1500,India


In [0]:
# Reusing an existing column name ("salary") replaces that column rather than
# appending a second one.
# Multiplying by the float 1.1 yields floating-point results, so the displayed
# values can carry binary rounding artifacts (e.g. 3300.0000000000005); apply
# rounding to the expression if exact two-decimal amounts are needed.
salary_with_raise = col("salary") * 1.1
df_updated = df.withColumn("salary", salary_with_raise)
df_updated.display()


id,name,salary
1,Alice,2200.0
2,Bob,3300.0000000000005
3,Charlie,1650.0000000000002


In [0]:
# Derive a label from a condition: when() yields "High" for rows whose salary
# exceeds 2500, and otherwise() supplies "Low" for every remaining row.
# The column is built from the original `df`, so the untouched salaries are used,
# not the increased ones held in `df_updated`.
salary_band = when(col("salary") > 2500, "High").otherwise("Low")
df_status = df.withColumn("status", salary_band)
df_status.display()


id,name,salary,status
1,Alice,2000,Low
2,Bob,3000,High
3,Charlie,1500,Low
