In [0]:
# import SparkSession from spark SQL

from pyspark.sql import SparkSession

# Obtain a session registered under this app name (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.appName('Basic Transformation - I').getOrCreate()

In [0]:
# evaluate the session object so the notebook renders it as this cell's output
spark

In [0]:
# emp data and schema
# Every field is stored as a string here, including the numeric-looking and date-looking ones.

emp_data = [
    ['001', '101', 'Alice Johnson', '29', 'Female', '52000', '2016-03-15'],
    ['002', '102', 'Bob Smith', '34', 'Male', '48000', '2017-07-20'],
    ['003', '103', 'Charlie Brown', '26', 'Male', '47000', '2018-02-01'],
    ['004', '104', 'Diana Prince', '31', 'Female', '55000', '2015-11-12'],
    ['005', '105', 'Ethan Hunt', '28', 'Male', '49500', '2019-05-08'],
    ['006', '106', 'Fiona Gallagher', '27', 'Female', '51000', '2016-09-10'],
    ['007', '107', 'George Costanza', '33', 'Male', '46000', '2015-01-01'],
    ['008', '108', 'Hannah Montana', '25', 'Female', '53000', '2017-04-14'],
    ['009', '109', 'Isaac Newton', '35', 'Male', '60000', '2018-10-25'],
    ['010', '110', 'Jessica Jones', '32', 'Female', '58000', '2016-06-30'],
    ['011', '111', 'Kevin Malone', '36', 'Male', '49000', '2015-03-18'],
    ['012', '112', 'Laura Croft', '30', 'Female', '57000', '2017-08-22'],
    ['013', '113', 'Mike Ross', '29', 'Male', '51500', '2019-01-05'],
    ['014', '114', 'Nancy Drew', '27', 'Female', '54000', '2016-12-09'],
    ['015', '115', 'Oscar Martinez', '28', 'Male', '45500', '2018-03-19'],
    ['016', '116', 'Pam Beesly', '31', 'Female', '47500', '2019-07-13'],
    ['017', '117', 'Quinn Fabray', '26', 'Female', '51000', '2015-11-05'],
    ['018', '118', 'Ryan Howard', '34', 'Male', '49000', '2016-04-21'],
    ['019', '119', 'Sophia Loren', '30', 'Female', '56000', '2018-09-17'],
    ['020', '120', 'Toby Flenderson', '35', 'Male', '45000', '2017-02-08'],
]

# DDL-style schema string: one "<column> string" entry per field, in the same order as the row values
emp_schema = ", ".join(
    f"{column} string"
    for column in (
        "employee_id",
        "department_id",
        "name",
        "age",
        "gender",
        "salary",
        "hire_date",
    )
)

In [0]:
# build the employee DataFrame from the in-memory rows, using the DDL schema string for column names and types
emp = spark.createDataFrame(emp_data, schema=emp_schema)

In [0]:
# print the DataFrame's column names, types and nullability as a tree
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [0]:
# the same schema in Spark's native form: a StructType made of StructField objects

emp.schema

StructType([StructField('employee_id', StringType(), True), StructField('department_id', StringType(), True), StructField('name', StringType(), True), StructField('age', StringType(), True), StructField('gender', StringType(), True), StructField('salary', StringType(), True), StructField('hire_date', StringType(), True)])

In [0]:
# small example for schema: one two-column layout written in both notations

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# DDL-string notation
schema_string = 'name string, age int'

# native notation, built from explicit StructField objects
schema_spark = StructType(
    [
        StructField('name', StringType(), nullable=True),
        StructField('age', IntegerType(), nullable=True),
    ]
)

schema_spark

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])

In [0]:
# column and expression
# ways to refer to a DataFrame column

from pyspark.sql.functions import col, expr
col('name')  # last expression of the cell, so the notebook displays the resulting Column object
# alternative ways to refer to a column (left disabled here; all are used in the SELECT cell):
# expr('name')
# emp['salary']

Column<'name'>

In [0]:
# SELECT columns
# SQL equivalent: select employee_id, name, age, salary from emp
# Each column below is referenced with a different syntax to show that they can be mixed.

emp_filtered = emp.select(
    col('employee_id'),  # col() function
    expr('name'),        # SQL expression string
    emp.age,             # attribute access on the DataFrame
    emp['salary'],       # item access on the DataFrame
)

In [0]:
# print the selected columns as a text table
emp_filtered.show()

+-----------+---------------+---+------+
|employee_id|           name|age|salary|
+-----------+---------------+---+------+
|        001|  Alice Johnson| 29| 52000|
|        002|      Bob Smith| 34| 48000|
|        003|  Charlie Brown| 26| 47000|
|        004|   Diana Prince| 31| 55000|
|        005|     Ethan Hunt| 28| 49500|
|        006|Fiona Gallagher| 27| 51000|
|        007|George Costanza| 33| 46000|
|        008| Hannah Montana| 25| 53000|
|        009|   Isaac Newton| 35| 60000|
|        010|  Jessica Jones| 32| 58000|
|        011|   Kevin Malone| 36| 49000|
|        012|    Laura Croft| 30| 57000|
|        013|      Mike Ross| 29| 51500|
|        014|     Nancy Drew| 27| 54000|
|        015| Oscar Martinez| 28| 45500|
|        016|     Pam Beesly| 31| 47500|
|        017|   Quinn Fabray| 26| 51000|
|        018|    Ryan Howard| 34| 49000|
|        019|   Sophia Loren| 30| 56000|
|        020|Toby Flenderson| 35| 45000|
+-----------+---------------+---+------+



In [0]:
# using expr for select
# SQL equivalent: select employee_id as emp_id, name, cast(age as int) as age, salary from emp_filtered
# name and salary are referenced through emp_filtered (the frame being selected from)
# rather than through its parent emp, so the cell does not depend on how emp_filtered was derived.

emp_casted = emp_filtered.select(
    expr('employee_id as emp_id'),    # rename via SQL alias
    emp_filtered.name,
    expr('cast(age as int) as age'),  # string -> int, keeping the column name
    emp_filtered.salary,
)

In [0]:
# print emp_casted as a text table (employee_id now appears as emp_id)
emp_casted.show()

+------+---------------+---+------+
|emp_id|           name|age|salary|
+------+---------------+---+------+
|   001|  Alice Johnson| 29| 52000|
|   002|      Bob Smith| 34| 48000|
|   003|  Charlie Brown| 26| 47000|
|   004|   Diana Prince| 31| 55000|
|   005|     Ethan Hunt| 28| 49500|
|   006|Fiona Gallagher| 27| 51000|
|   007|George Costanza| 33| 46000|
|   008| Hannah Montana| 25| 53000|
|   009|   Isaac Newton| 35| 60000|
|   010|  Jessica Jones| 32| 58000|
|   011|   Kevin Malone| 36| 49000|
|   012|    Laura Croft| 30| 57000|
|   013|      Mike Ross| 29| 51500|
|   014|     Nancy Drew| 27| 54000|
|   015| Oscar Martinez| 28| 45500|
|   016|     Pam Beesly| 31| 47500|
|   017|   Quinn Fabray| 26| 51000|
|   018|    Ryan Howard| 34| 49000|
|   019|   Sophia Loren| 30| 56000|
|   020|Toby Flenderson| 35| 45000|
+------+---------------+---+------+



In [0]:
# same projection as emp_casted, written with selectExpr: every argument is a SQL expression string
emp_casted_1 = emp_filtered.selectExpr(
    'employee_id as emp_id',
    'name',
    'cast(age as int) as age',
    'salary',
)

In [0]:
# print only the first 5 rows
emp_casted_1.show(5)

+------+-------------+---+------+
|emp_id|         name|age|salary|
+------+-------------+---+------+
|   001|Alice Johnson| 29| 52000|
|   002|    Bob Smith| 34| 48000|
|   003|Charlie Brown| 26| 47000|
|   004| Diana Prince| 31| 55000|
|   005|   Ethan Hunt| 28| 49500|
+------+-------------+---+------+
only showing top 5 rows



In [0]:
# confirm the cast: age is now integer, the other columns remain string
emp_casted.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: string (nullable = true)



In [0]:
# keep only employees older than 30
# SQL equivalent: select emp_id, name, age, salary from emp_casted where age > 30

emp_final = (
    emp_casted
    .select('emp_id', 'name', 'age', 'salary')
    .where('age > 30')  # filter condition given as a SQL expression string
)

In [0]:
# print the rows that passed the age > 30 filter
emp_final.show()

+------+---------------+---+------+
|emp_id|           name|age|salary|
+------+---------------+---+------+
|   002|      Bob Smith| 34| 48000|
|   004|   Diana Prince| 31| 55000|
|   007|George Costanza| 33| 46000|
|   009|   Isaac Newton| 35| 60000|
|   010|  Jessica Jones| 32| 58000|
|   011|   Kevin Malone| 36| 49000|
|   016|     Pam Beesly| 31| 47500|
|   018|    Ryan Howard| 34| 49000|
|   020|Toby Flenderson| 35| 45000|
+------+---------------+---+------+



In [0]:
# write the data back to CSV (disabled: uncomment the line below to run the write)

# emp_final.write.format('csv').save('data/output/2/emp2.csv')

In [0]:
%sh
# print the current working directory of the shell
pwd

/Workspace/Users/weerapat.somtua@outlook.com/notebooks


In [0]:
# convert a DDL schema string into Spark's native schema (StructType)

# NOTE(review): the leading underscore suggests _parse_datatype_string is a private helper —
# confirm it is available in the PySpark version this notebook targets.
from pyspark.sql.types import _parse_datatype_string

schema_str = 'name string, age int'
schema_spark = _parse_datatype_string(schema_str)

schema_spark

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])