In [0]:
# import SparkSession from spark SQL

from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via getOrCreate(), registering the
# application name used for this notebook.
spark = SparkSession.builder.appName('Basic Transformation - I').getOrCreate()

In [0]:
# Echo the session object so the notebook displays it.
spark

In [0]:
# emp data and schema

# Employee records, one '|'-delimited line per employee. Field order:
#   employee_id | department_id | name | age | gender | salary | hire_date
# Each record is split into a list of 7 strings, so emp_data is a list of lists.
emp_data = [
    record.split('|')
    for record in (
        '001|101|Alice Johnson|29|Female|52000|2016-03-15',
        '002|102|Bob Smith|34|Male|48000|2017-07-20',
        '003|103|Charlie Brown|26|Male|47000|2018-02-01',
        '004|104|Diana Prince|31|Female|55000|2015-11-12',
        '005|105|Ethan Hunt|28|Male|49500|2019-05-08',
        '006|106|Fiona Gallagher|27|Female|51000|2016-09-10',
        '007|107|George Costanza|33|Male|46000|2015-01-01',
        '008|108|Hannah Montana|25|Female|53000|2017-04-14',
        '009|109|Isaac Newton|35|Male|60000|2018-10-25',
        '010|110|Jessica Jones|32|Female|58000|2016-06-30',
        '011|111|Kevin Malone|36|Male|49000|2015-03-18',
        '012|112|Laura Croft|30|Female|57000|2017-08-22',
        '013|113|Mike Ross|29|Male|51500|2019-01-05',
        '014|114|Nancy Drew|27|Female|54000|2016-12-09',
        '015|115|Oscar Martinez|28|Male|45500|2018-03-19',
        '016|116|Pam Beesly|31|Female|47500|2019-07-13',
        '017|117|Quinn Fabray|26|Female|51000|2015-11-05',
        '018|118|Ryan Howard|34|Male|49000|2016-04-21',
        '019|119|Sophia Loren|30|Female|56000|2018-09-17',
        '020|120|Toby Flenderson|35|Male|45000|2017-02-08',
    )
]

# DDL-style schema string: "<column> string" for each field, comma-separated.
# Every column is declared as a string.
emp_schema = ", ".join(
    f"{field} string"
    for field in (
        "employee_id",
        "department_id",
        "name",
        "age",
        "gender",
        "salary",
        "hire_date",
    )
)

In [0]:
# Build the employee DataFrame from the in-memory rows (emp_data) and the
# DDL schema string (emp_schema).
emp = spark.createDataFrame(emp_data, emp_schema)

In [0]:
# Print the DataFrame schema; every column was declared as string in emp_schema.
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# Casting a column
# SQL equivalent:
#   select employee_id, name, age, cast(salary as double) as salary from emp

# Only `col` is needed here: the conversion is performed by the Column.cast()
# method, not by a standalone `cast` function.
from pyspark.sql.functions import col

# The explicit alias keeps the converted column named `salary`, mirroring the
# `as salary` in the SQL above.
emp_casted = emp.select(
    'employee_id',
    'name',
    'age',
    col('salary').cast('double').alias('salary'),
)

In [0]:
# Confirm the cast: salary should now be reported as double.
emp_casted.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: double (nullable = true)



In [0]:
# Add a derived column
# SQL equivalent:
#   select employee_id, name, age, salary, (salary * 0.2) as tax from emp_casted

emp_taxed = (
    emp_casted
    .withColumn('tax', col('salary') * 0.2)  # tax is 20% of salary
)

In [0]:
# Preview the first 5 rows, including the derived tax column.
emp_taxed.show(5)

+-----------+-------------+---+-------+-------+
|employee_id|         name|age| salary|    tax|
+-----------+-------------+---+-------+-------+
|        001|Alice Johnson| 29|52000.0|10400.0|
|        002|    Bob Smith| 34|48000.0| 9600.0|
|        003|Charlie Brown| 26|47000.0| 9400.0|
|        004| Diana Prince| 31|55000.0|11000.0|
|        005|   Ethan Hunt| 28|49500.0| 9900.0|
+-----------+-------------+---+-------+-------+
only showing top 5 rows



In [0]:
# Literal (constant) columns
# SQL equivalent:
#   select employee_id, name, age, salary, tax, 1 as columnOne, 'two' as columnTwo from emp_taxed
from pyspark.sql.functions import lit

emp_new_cols = (
    emp_taxed
    .withColumn('columnOne', lit(1))      # integer constant on every row
    .withColumn('columnTwo', lit('two'))  # string constant on every row
)

In [0]:
# Preview the first 5 rows with the two literal columns appended.
emp_new_cols.show(5)

+-----------+-------------+---+-------+-------+---------+---------+
|employee_id|         name|age| salary|    tax|columnOne|columnTwo|
+-----------+-------------+---+-------+-------+---------+---------+
|        001|Alice Johnson| 29|52000.0|10400.0|        1|      two|
|        002|    Bob Smith| 34|48000.0| 9600.0|        1|      two|
|        003|Charlie Brown| 26|47000.0| 9400.0|        1|      two|
|        004| Diana Prince| 31|55000.0|11000.0|        1|      two|
|        005|   Ethan Hunt| 28|49500.0| 9900.0|        1|      two|
+-----------+-------------+---+-------+-------+---------+---------+
only showing top 5 rows



In [0]:
# Rename a column
# SQL equivalent:
#   select employee_id as emp_id, name, age, salary, tax, columnOne, columnTwo from emp_new_cols
emp_1 = (
    emp_new_cols
    .withColumnRenamed('employee_id', 'emp_id')
)


In [0]:
# Check that employee_id now appears as emp_id.
emp_1.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- tax: double (nullable = true)
 |-- columnOne: integer (nullable = false)
 |-- columnTwo: string (nullable = false)



In [0]:
# Rename to a column name that contains a space
# SQL equivalent:
#   select employee_id, name, age, salary, tax, columnOne, columnTwo as 'Column Two' from emp_new_cols
emp_2 = (
    emp_new_cols
    .withColumnRenamed('columnTwo', 'Column Two')
)

In [0]:
# Check that columnTwo now appears as "Column Two".
emp_2.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- tax: double (nullable = true)
 |-- columnOne: integer (nullable = false)
 |-- Column Two: string (nullable = false)



In [0]:
# Drop columns
# SQL equivalent:
#   select employee_id, name, age, salary, tax from emp_new_cols
emp_dropped = emp_new_cols.drop(
    'columnTwo',
    'columnOne',
)

In [0]:
# Check that columnOne and columnTwo are gone.
emp_dropped.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- tax: double (nullable = true)



In [0]:
# Filter rows
# SQL equivalent:
#   select employee_id, name, age, salary, tax from emp_dropped where salary > 50000
# The predicate is passed as a SQL expression string.
emp_filtered = emp_dropped.filter('salary > 50000')

In [0]:
# Show the rows that passed the salary filter.
emp_filtered.show()

+-----------+---------------+---+-------+-------+
|employee_id|           name|age| salary|    tax|
+-----------+---------------+---+-------+-------+
|        001|  Alice Johnson| 29|52000.0|10400.0|
|        004|   Diana Prince| 31|55000.0|11000.0|
|        006|Fiona Gallagher| 27|51000.0|10200.0|
|        008| Hannah Montana| 25|53000.0|10600.0|
|        009|   Isaac Newton| 35|60000.0|12000.0|
|        010|  Jessica Jones| 32|58000.0|11600.0|
|        012|    Laura Croft| 30|57000.0|11400.0|
|        013|      Mike Ross| 29|51500.0|10300.0|
|        014|     Nancy Drew| 27|54000.0|10800.0|
|        017|   Quinn Fabray| 26|51000.0|10200.0|
|        019|   Sophia Loren| 30|56000.0|11200.0|
+-----------+---------------+---+-------+-------+



In [0]:
# Limit rows
# SQL equivalent:
#   select employee_id, name, age, salary, tax from emp_filtered limit 5
emp_limit = (
    emp_filtered
    .limit(5)
)

In [0]:
# Show the limited result (at most 5 rows).
emp_limit.show()

+-----------+---------------+---+-------+-------+
|employee_id|           name|age| salary|    tax|
+-----------+---------------+---+-------+-------+
|        001|  Alice Johnson| 29|52000.0|10400.0|
|        004|   Diana Prince| 31|55000.0|11000.0|
|        006|Fiona Gallagher| 27|51000.0|10200.0|
|        008| Hannah Montana| 25|53000.0|10600.0|
|        009|   Isaac Newton| 35|60000.0|12000.0|
+-----------+---------------+---+-------+-------+



In [0]:
# Add several columns in a single call
# Maps each new column name to the expression that produces it.
# Note: this starts from `emp`, where salary is still a string column.
columns = dict(
    tax=col('salary') * 0.2,
    oneNumber=lit(1),
    columnTwo=lit('Two'),
)

emp_final = emp.withColumns(columns)

In [0]:
# Preview the first 5 rows with the three new columns.
emp_final.show(5)

+-----------+-------------+-------------+---+------+------+----------+-------+---------+---------+
|employee_id|department_id|         name|age|gender|salary| hire_date|    tax|oneNumber|columnTwo|
+-----------+-------------+-------------+---+------+------+----------+-------+---------+---------+
|        001|          101|Alice Johnson| 29|Female| 52000|2016-03-15|10400.0|        1|      Two|
|        002|          102|    Bob Smith| 34|  Male| 48000|2017-07-20| 9600.0|        1|      Two|
|        003|          103|Charlie Brown| 26|  Male| 47000|2018-02-01| 9400.0|        1|      Two|
|        004|          104| Diana Prince| 31|Female| 55000|2015-11-12|11000.0|        1|      Two|
|        005|          105|   Ethan Hunt| 28|  Male| 49500|2019-05-08| 9900.0|        1|      Two|
+-----------+-------------+-------------+---+------+------+----------+-------+---------+---------+
only showing top 5 rows

