In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()

In [2]:
data = [
        ('James','Smith','M',3000),
        ('Anna','Rose','F',4100),
        ('Robert','Williams','M',6200)
       ]

columns = ["firstname","lastname","gender","salary"]

In [3]:
df = spark.createDataFrame(data=data,schema=columns)

In [4]:
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|   Robert|Williams|     M|  6200|
+---------+--------+------+------+



In [9]:
# Add new constanct column

In [6]:
if 'bonus' not in df.columns:
    df = df.withColumn('bonus',lit(0.3))

In [7]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- bonus: double (nullable = false)



In [8]:
df.show()

+---------+--------+------+------+-----+
|firstname|lastname|gender|salary|bonus|
+---------+--------+------+------+-----+
|    James|   Smith|     M|  3000|  0.3|
|     Anna|    Rose|     F|  4100|  0.3|
|   Robert|Williams|     M|  6200|  0.3|
+---------+--------+------+------+-----+



In [10]:
#Add column from existing column

In [11]:
df = df.withColumn('bonus_amount',df['salary']*df['bonus'])
df.show()

+---------+--------+------+------+-----+------------+
|firstname|lastname|gender|salary|bonus|bonus_amount|
+---------+--------+------+------+-----+------------+
|    James|   Smith|     M|  3000|  0.3|       900.0|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|
+---------+--------+------+------+-----+------------+



In [12]:
#Add column by concatinating existing columns

In [15]:
df.withColumn('Full Name',expr("firstname ||' '||lastname")).show()

+---------+--------+------+------+-----+------------+---------------+
|firstname|lastname|gender|salary|bonus|bonus_amount|      Full Name|
+---------+--------+------+------+-----+------------+---------------+
|    James|   Smith|     M|  3000|  0.3|       900.0|    James Smith|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|      Anna Rose|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|Robert Williams|
+---------+--------+------+------+-----+------------+---------------+



In [16]:
df.withColumn('Full Name',concat_ws(' ',df['firstname'],df['lastname'])).show()

+---------+--------+------+------+-----+------------+---------------+
|firstname|lastname|gender|salary|bonus|bonus_amount|      Full Name|
+---------+--------+------+------+-----+------------+---------------+
|    James|   Smith|     M|  3000|  0.3|       900.0|    James Smith|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|      Anna Rose|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|Robert Williams|
+---------+--------+------+------+-----+------------+---------------+



In [17]:
df.withColumn('FullName',concat(df.firstname,df.lastname)).show()

+---------+--------+------+------+-----+------------+--------------+
|firstname|lastname|gender|salary|bonus|bonus_amount|      FullName|
+---------+--------+------+------+-----+------------+--------------+
|    James|   Smith|     M|  3000|  0.3|       900.0|    JamesSmith|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|      AnnaRose|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|RobertWilliams|
+---------+--------+------+------+-----+------------+--------------+



In [18]:
#Add current date

In [20]:
df.withColumn('current date',current_timestamp()).show()

+---------+--------+------+------+-----+------------+--------------------+
|firstname|lastname|gender|salary|bonus|bonus_amount|        current date|
+---------+--------+------+------+-----+------------+--------------------+
|    James|   Smith|     M|  3000|  0.3|       900.0|2021-09-02 22:34:...|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|2021-09-02 22:34:...|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|2021-09-02 22:34:...|
+---------+--------+------+------+-----+------------+--------------------+



In [None]:
df.withColumn("grade", \
   when((df.salary < 4000), lit("A")) \
     .when((df.salary >= 4000) & (df.salary <= 5000), lit("B")) \
     .otherwise(lit("C")) \
  ).show()

In [21]:
df.withColumn('Grade',when(df.salary > 5000,lit('A')).when(df.salary > 4000,lit('B')).otherwise(lit('C'))).show()

+---------+--------+------+------+-----+------------+-----+
|firstname|lastname|gender|salary|bonus|bonus_amount|Grade|
+---------+--------+------+------+-----+------------+-----+
|    James|   Smith|     M|  3000|  0.3|       900.0|    C|
|     Anna|    Rose|     F|  4100|  0.3|      1230.0|    B|
|   Robert|Williams|     M|  6200|  0.3|      1860.0|    A|
+---------+--------+------+------+-----+------------+-----+



In [22]:
# Add column using select

In [23]:
df.select('firstname','salary',lit(0.3).alias('Bonus')).show()

+---------+------+-----+
|firstname|salary|Bonus|
+---------+------+-----+
|    James|  3000|  0.3|
|     Anna|  4100|  0.3|
|   Robert|  6200|  0.3|
+---------+------+-----+



In [26]:
df.select('firstname','salary',lit(df.salary * 0.3).alias('bonus_amount')).show()

+---------+------+------------+
|firstname|salary|bonus_amount|
+---------+------+------------+
|    James|  3000|       900.0|
|     Anna|  4100|      1230.0|
|   Robert|  6200|      1860.0|
+---------+------+------------+



In [27]:
df.select('firstname','salary',current_date().alias('Current Date')).show()

+---------+------+------------+
|firstname|salary|Current Date|
+---------+------+------------+
|    James|  3000|  2021-09-02|
|     Anna|  4100|  2021-09-02|
|   Robert|  6200|  2021-09-02|
+---------+------+------------+



In [28]:
#Add columns using SQL

In [29]:
df.createOrReplaceTempView('Employee')

In [31]:
spark.sql("select firstname,salary,0.3 as bonus from Employee").show()

+---------+------+-----+
|firstname|salary|bonus|
+---------+------+-----+
|    James|  3000|  0.3|
|     Anna|  4100|  0.3|
|   Robert|  6200|  0.3|
+---------+------+-----+



In [32]:
spark.sql("select firstname,salary, salary * 0.3 as salary_amount from Employee").show()

+---------+------+-------------+
|firstname|salary|salary_amount|
+---------+------+-------------+
|    James|  3000|        900.0|
|     Anna|  4100|       1230.0|
|   Robert|  6200|       1860.0|
+---------+------+-------------+



In [33]:
spark.sql("select firstname,salary,current_date as date from Employee").show()

+---------+------+----------+
|firstname|salary|      date|
+---------+------+----------+
|    James|  3000|2021-09-02|
|     Anna|  4100|2021-09-02|
|   Robert|  6200|2021-09-02|
+---------+------+----------+



In [41]:
spark.sql("select firstname,salary, case salary when salary < 4000 then 'A' else 'B' end as grade from Employee").show()

+---------+------+-----+
|firstname|salary|grade|
+---------+------+-----+
|    James|  3000|    B|
|     Anna|  4100|    B|
|   Robert|  6200|    B|
+---------+------+-----+

