# WithColumn function in Pyspark

In [2]:
'''PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of 
an existing column, create a new column, and many more'''

'PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of \nan existing column, create a new column, and many more'

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()


In [3]:

data = [('James','xxx','Smith','1991-04-01','M','3000'),
  ('Michael','Rose','yyy','2000-05-19','M','4000'),
  ('Robert','aaa','Williams','1978-09-05','M','4000'),
  ('Maria','Anne','Jones','1967-12-01','F','4000'),
  ('Jen','Mary','Brown','1980-02-17','F','-1')]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data=data, schema=columns)

df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



# 1. Change DataType

In [4]:
from pyspark.sql.functions import col

df1 = df.withColumn("salary1",col("salary").cast("Integer"))

df1.show()


+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|salary1|
+---------+----------+--------+----------+------+------+-------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|   3000|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|   4000|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|   4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|   4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|     -1|
+---------+----------+--------+----------+------+------+-------+



# 2. Update The Value of an Existing Column

In [6]:

df.withColumn("salary",col("salary")*100).show()
df.withColumn("salary",col("salary")*100).printSchema()


+---------+----------+--------+----------+------+--------+
|firstname|middlename|lastname|       dob|gender|  salary|
+---------+----------+--------+----------+------+--------+
|    James|       xxx|   Smith|1991-04-01|     M|300000.0|
|  Michael|      Rose|     yyy|2000-05-19|     M|400000.0|
|   Robert|       aaa|Williams|1978-09-05|     M|400000.0|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100.0|
+---------+----------+--------+----------+------+--------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



# 3. Create a Column from an Existing Column

In [7]:

df.withColumn("CopiedColumn",col("salary")* -1).show()


+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|     -3000.0|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|     -4000.0|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|     -4000.0|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|     -4000.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|         1.0|
+---------+----------+--------+----------+------+------+------------+



# 4. Add a New Column

In [11]:
from pyspark.sql.functions import col, lit

df.withColumn("Country", lit("USA")).show()
df.withColumn("Country", lit("USA")) \
  .withColumn("anotherColumn",lit("anotherValue")) \
  .show()


+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|Country|
+---------+----------+--------+----------+------+------+-------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|    USA|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|    USA|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|    USA|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|
+---------+----------+--------+----------+------+------+-------+

+---------+----------+--------+----------+------+------+-------+-------------+
|firstname|middlename|lastname|       dob|gender|salary|Country|anotherColumn|
+---------+----------+--------+----------+------+------+-------+-------------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|    USA| anotherValue|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|    USA| anotherValue|
|   Robert|       a

# 5. Rename Column Name

In [12]:
df.withColumnRenamed("gender","sex") \
  .show(truncate=False) 


+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |xxx       |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |yyy     |2000-05-19|M  |4000  |
|Robert   |aaa       |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



# WithColumnRenamed

In [14]:

df2 = df.withColumnRenamed("dob","DateOfBirth") \
        .withColumnRenamed("salary","salary_amount")
df2.printSchema()
df2.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: string (nullable = true)

+---------+----------+--------+-----------+------+-------------+
|firstname|middlename|lastname|DateOfBirth|gender|salary_amount|
+---------+----------+--------+-----------+------+-------------+
|    James|       xxx|   Smith| 1991-04-01|     M|         3000|
|  Michael|      Rose|     yyy| 2000-05-19|     M|         4000|
|   Robert|       aaa|Williams| 1978-09-05|     M|         4000|
|    Maria|      Anne|   Jones| 1967-12-01|     F|         4000|
|      Jen|      Mary|   Brown| 1980-02-17|     F|           -1|
+---------+----------+--------+-----------+------+-------------+



In [15]:
spark.stop()