In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Attach to the Spark session that is already running, or start one with
# default settings if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [16]:
# Three sample rows. Field order matches the names listed in `col` below.
# The job start date and graduation flag are plain strings here, and the last
# row's date ("06-01-1992") is laid out differently from the first two.
data = [
    ("James", 34, "2006-01-01", "true", "M", 3000.60),
    ("Michael", 33, "1980-01-10", "true", "F", 3300.80),
    ("Robert", 37, "06-01-1992", "false", "M", 5000.50),
]

# Column names, in the same order as the tuple fields above.
# NOTE: `col` is also a function exported by pyspark.sql.functions, which is
# star-imported at the top of the notebook; this assignment rebinds the name
# to a plain list.
col = [
    "firstname",
    "age",
    "jobStartDate",
    "isGraduated",
    "gender",
    "salary",
]

In [17]:
# Build a DataFrame from the sample rows, using the list `col` as the column
# names, then print its contents.
df = spark.createDataFrame(data, col)
df.show()

+---------+---+------------+-----------+------+------+
|firstname|age|jobStartDate|isGraduated|gender|salary|
+---------+---+------------+-----------+------+------+
|    James| 34|  2006-01-01|       true|     M|3000.6|
|  Michael| 33|  1980-01-10|       true|     F|3300.8|
|   Robert| 37|  06-01-1992|      false|     M|5000.5|
+---------+---+------------+-----------+------+------+



In [18]:
# Print the column names and data types of df.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- age: long (nullable = true)
 |-- jobStartDate: string (nullable = true)
 |-- isGraduated: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



# withColumn()

In [13]:
# Cast age, isGraduated and jobStartDate to int, boolean and date.
#
# Columns are referenced through `df[...]` instead of `col(...)`: an earlier
# cell rebinds the name `col` to a list of column names, so calling `col(...)`
# here raises "TypeError: 'list' object is not callable" when the notebook is
# run top to bottom.
#
# With Spark's non-ANSI cast behaviour, a jobStartDate string that the date
# cast cannot parse (the "06-01-1992" row in the sample data) becomes null.
df = (
    df.withColumn('age', df['age'].cast('int'))
      .withColumn('isGraduated', df['isGraduated'].cast(BooleanType()))
      .withColumn('jobStartDate', df['jobStartDate'].cast(DateType()))
)
# Equivalent ways to spell the integer cast:
#   df['age'].cast(IntegerType())
#   df['age'].cast('integer')

In [14]:
# Display the current rows of df.
df.show()

+---------+---+------------+-----------+------+------+
|firstname|age|jobStartDate|isGraduated|gender|salary|
+---------+---+------------+-----------+------+------+
|    James| 34|  2006-01-01|       true|     M|3000.6|
|  Michael| 33|  1980-01-10|       true|     F|3300.8|
|   Robert| 37|        null|      false|     M|5000.5|
+---------+---+------------+-----------+------+------+



In [15]:
# Print df's schema to check the column types.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- jobStartDate: date (nullable = true)
 |-- isGraduated: boolean (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



# select()

In [24]:
# Display the current rows of df.
df.show()

+---------+---+------------+-----------+------+------+
|firstname|age|jobStartDate|isGraduated|gender|salary|
+---------+---+------------+-----------+------+------+
|    James| 34|  2006-01-01|       true|     M|3000.6|
|  Michael| 33|  1980-01-10|       true|     F|3300.8|
|   Robert| 37|  06-01-1992|      false|     M|5000.5|
+---------+---+------------+-----------+------+------+



In [28]:
# Project only the `age` column, cast to IntegerType, and print the
# resulting single-column frame.
(
    df
    .select(df.age.cast(IntegerType()))
    .show()
)

+---+
|age|
+---+
| 34|
| 33|
| 37|
+---+



# selectExpr()

In [33]:
# Same kind of casting written as SQL expressions: each string is a
# `cast(<column> as <type>) <alias>` projection, so df2 holds only these
# two columns.
df2 = df.selectExpr(
    'cast(age as int) age',
    'cast(isGraduated as string) isGraduated',
)

In [34]:
# Print df2's schema to check the types produced by the SQL casts.
df2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- isGraduated: string (nullable = true)

