# withColumn() Function in PySpark

In [2]:
# Note cell: a bare string literal describing withColumn(). It is the cell's
# only expression, so the notebook echoes its repr as the cell output.
'''PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of 
   an existing column, create a new column, and many more'''

'PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of \n   an existing column, create a new column, and many more'

In [3]:
# findspark is initialised here, before the first pyspark import in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable as `pyspark` — confirm against the findspark documentation.
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

# Single session shared by every cell below; it is registered under the
# application name 'SparkByExamples.com'.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)


In [5]:

# Column names, in the same order as the fields of each row tuple below.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Sample rows. Every field, including dob and salary, is supplied as a string,
# which is why the printed schema reports string for all six columns.
data = [
    ('James', 'xxx', 'Smith', '1991-04-01', 'M', '3000'),
    ('Michael', 'Rose', 'yyy', '2000-05-19', 'M', '4000'),
    ('Robert', 'aaa', 'Williams', '1978-09-05', 'M', '4000'),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', '4000'),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', '-1'),
]

# Base DataFrame that the rest of the notebook derives from.
df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



# 1. Change DataType

In [6]:
from pyspark.sql.functions import col

# Add "salary1" as an integer-typed copy of the string column "salary";
# the original column is left in place so both can be compared.
df1 = df.withColumn(
    "salary1",
    col("salary").cast("Integer"),
)

df1.show()
df1.printSchema()

# Possible follow-ups for missing values (not executed here):
#   df.na.fill(0)
#   df.na.drop()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|salary1|
+---------+----------+--------+----------+------+------+-------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|   3000|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|   4000|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|   4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|   4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|     -1|
+---------+----------+--------+----------+------+------+-------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- salary1: integer (nullable = true)



# 2. Update the Value of an Existing Column

In [7]:

# Passing the name of a column that already exists ("salary1") makes
# withColumn() replace that column instead of appending a new one.
# The derived frame is built once and reused for both displays; df1 itself
# is not reassigned, so re-running this cell gives the same result.
salary_scaled_df = df1.withColumn("salary1", col("salary1") * 100)

salary_scaled_df.show()
salary_scaled_df.printSchema()


+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|salary1|
+---------+----------+--------+----------+------+------+-------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000| 300000|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000| 400000|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000| 400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000| 400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|   -100|
+---------+----------+--------+----------+------+------+-------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- salary1: integer (nullable = true)



# 3. Create a Column from an Existing Column

In [13]:
from pyspark.sql.functions import concat, col, lit

# Build "Name" as firstname + '-' + lastname; lit('-') supplies the separator
# as a column expression. The source columns stay in the result.
df.withColumn(
    "Name",
    concat(col("firstname"), lit('-'), col("lastname")),
).show()

# Alternative example (not executed here): derive a column by negating salary.
#   df.withColumn("CopiedColumn", col("salary") * -1).show()
# For documentation on the concat function, run help(concat).

+---------+----------+--------+----------+------+------+---------------+
|firstname|middlename|lastname|       dob|gender|salary|           Name|
+---------+----------+--------+----------+------+------+---------------+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|    James-Smith|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|    Michael-yyy|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|Robert-Williams|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    Maria-Jones|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|      Jen-Brown|
+---------+----------+--------+----------+------+------+---------------+



# 4. Add a New Column

In [15]:
from pyspark.sql.functions import col, lit

# Append two constant-valued columns by chaining withColumn(); each lit(...)
# value is repeated on every row of the displayed result.
# Single-column variant (not executed here):
#   df.withColumn("Country", lit("USA")).show()
(
    df
    .withColumn("Country", lit("USA"))
    .withColumn("DEPT", lit("IT"))
    .show()
)


+---------+----------+--------+----------+------+------+-------+----+
|firstname|middlename|lastname|       dob|gender|salary|Country|DEPT|
+---------+----------+--------+----------+------+------+-------+----+
|    James|       xxx|   Smith|1991-04-01|     M|  3000|    USA|  IT|
|  Michael|      Rose|     yyy|2000-05-19|     M|  4000|    USA|  IT|
|   Robert|       aaa|Williams|1978-09-05|     M|  4000|    USA|  IT|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|  IT|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|  IT|
+---------+----------+--------+----------+------+------+-------+----+



# 5. Rename a Column

In [16]:
# Display the frame with "gender" renamed to "sex". The result is only shown,
# not assigned, so df keeps its original column names.
df.withColumnRenamed("gender", "sex").show(truncate=False)


+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |xxx       |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |yyy     |2000-05-19|M  |4000  |
|Robert   |aaa       |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



# withColumnRenamed(): Renaming Multiple Columns

In [17]:

# Rename several columns by chaining one withColumnRenamed() call per column.
df2 = (
    df
    .withColumnRenamed("dob", "DateOfBirth")
    .withColumnRenamed("salary", "salary_amount")
)

df2.printSchema()
df2.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: string (nullable = true)

+---------+----------+--------+-----------+------+-------------+
|firstname|middlename|lastname|DateOfBirth|gender|salary_amount|
+---------+----------+--------+-----------+------+-------------+
|    James|       xxx|   Smith| 1991-04-01|     M|         3000|
|  Michael|      Rose|     yyy| 2000-05-19|     M|         4000|
|   Robert|       aaa|Williams| 1978-09-05|     M|         4000|
|    Maria|      Anne|   Jones| 1967-12-01|     F|         4000|
|      Jen|      Mary|   Brown| 1980-02-17|     F|           -1|
+---------+----------+--------+-----------+------+-------------+



In [18]:
# Stop the Spark session created near the top of the notebook now that the
# walkthrough is finished.
spark.stop()