In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Reuse the currently active Spark session if one exists; otherwise start a
# new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the employee CSV. The first line supplies the column names, and the
# column types are inferred from the data rather than declared up front.
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("emp_data.csv")
)

In [4]:
# Preview the loaded data (first 20 rows, which is also the default).
df.show(n=20)

+----+--------+--------+
|year|    dept|  salary|
+----+--------+--------+
|2004|      IT|    2004|
|2004|      IT|62022432|
|2004|Accounts| 4409580|
|2004|      HR|  211648|
|2004|   Sales|  902053|
|2004|      IT| 1005417|
|2004|Accounts|    3645|
|2005|      IT| 2974005|
|2005|      IT|62275153|
|2005|Accounts| 4239440|
|2005|      HR|  114120|
|2005|   Sales| 1215112|
|2005|      IT|  773217|
|2005|Accounts|    3101|
|2006|      IT| 3285138|
|2006|      IT|65932248|
|2006|Accounts| 4259693|
|2006|      HR|   67236|
|2006|   Sales|  836424|
|2006|      IT| 1095922|
+----+--------+--------+
only showing top 20 rows



In [5]:
# Add a "Name" column holding "<year>,<dept>", built with the SQL string
# concatenation operator (||) inside an expression.
year_dept_label = F.expr(" year ||','|| dept")
df.withColumn("Name", year_dept_label).show()

+----+--------+--------+-------------+
|year|    dept|  salary|         Name|
+----+--------+--------+-------------+
|2004|      IT|    2004|      2004,IT|
|2004|      IT|62022432|      2004,IT|
|2004|Accounts| 4409580|2004,Accounts|
|2004|      HR|  211648|      2004,HR|
|2004|   Sales|  902053|   2004,Sales|
|2004|      IT| 1005417|      2004,IT|
|2004|Accounts|    3645|2004,Accounts|
|2005|      IT| 2974005|      2005,IT|
|2005|      IT|62275153|      2005,IT|
|2005|Accounts| 4239440|2005,Accounts|
|2005|      HR|  114120|      2005,HR|
|2005|   Sales| 1215112|   2005,Sales|
|2005|      IT|  773217|      2005,IT|
|2005|Accounts|    3101|2005,Accounts|
|2006|      IT| 3285138|      2006,IT|
|2006|      IT|65932248|      2006,IT|
|2006|Accounts| 4259693|2006,Accounts|
|2006|      HR|   67236|      2006,HR|
|2006|   Sales|  836424|   2006,Sales|
|2006|      IT| 1095922|      2006,IT|
+----+--------+--------+-------------+
only showing top 20 rows



In [6]:
# Expand department codes into full names with a SQL CASE expression.
# Only 'HR' and 'IT' are mapped; every other code falls through to 'unknown'.
department_case = (
    "CASE WHEN dept = 'HR' THEN 'Human Resource' "
    "WHEN dept = 'IT' THEN 'Information Technology' ELSE 'unknown' END"
)
df.withColumn("Department", F.expr(department_case)).show(truncate=False)

+----+--------+--------+----------------------+
|year|dept    |salary  |Department            |
+----+--------+--------+----------------------+
|2004|IT      |2004    |Information Technology|
|2004|IT      |62022432|Information Technology|
|2004|Accounts|4409580 |unknown               |
|2004|HR      |211648  |Human Resource        |
|2004|Sales   |902053  |unknown               |
|2004|IT      |1005417 |Information Technology|
|2004|Accounts|3645    |unknown               |
|2005|IT      |2974005 |Information Technology|
|2005|IT      |62275153|Information Technology|
|2005|Accounts|4239440 |unknown               |
|2005|HR      |114120  |Human Resource        |
|2005|Sales   |1215112 |unknown               |
|2005|IT      |773217  |Information Technology|
|2005|Accounts|3101    |unknown               |
|2006|IT      |3285138 |Information Technology|
|2006|IT      |65932248|Information Technology|
|2006|Accounts|4259693 |unknown               |
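|2006|HR      |67236   |Human Resource        |
|2006|Sales   |836424  |unknown               |
|2006|IT      |1095922 |Information Technology|
+----+--------+--------+----------------------+
only showing top 20 rows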

In [7]:
# Project the department and salary next to a derived "new_salary" column
# (salary plus 10000); the alias is given inside the SQL expression itself.
raised_salary = F.expr("salary + 10000 as new_salary")
df.select(df["dept"], df["salary"], raised_salary).show()

+--------+--------+----------+
|    dept|  salary|new_salary|
+--------+--------+----------+
|      IT|    2004|     12004|
|      IT|62022432|  62032432|
|Accounts| 4409580|   4419580|
|      HR|  211648|    221648|
|   Sales|  902053|    912053|
|      IT| 1005417|   1015417|
|Accounts|    3645|     13645|
|      IT| 2974005|   2984005|
|      IT|62275153|  62285153|
|Accounts| 4239440|   4249440|
|      HR|  114120|    124120|
|   Sales| 1215112|   1225112|
|      IT|  773217|    783217|
|Accounts|    3101|     13101|
|      IT| 3285138|   3295138|
|      IT|65932248|  65942248|
|Accounts| 4259693|   4269693|
|      HR|   67236|     77236|
|   Sales|  836424|    846424|
|      IT| 1095922|   1105922|
+--------+--------+----------+
only showing top 20 rows



In [8]:
# Keep only the rows whose salary value equals their year value.
year_equals_salary = F.expr("year == salary")
df.where(year_equals_salary).show()

+----+----+------+
|year|dept|salary|
+----+----+------+
|2004|  IT|  2004|
+----+----+------+

