In [0]:
from pyspark.sql.functions import monotonically_increasing_id, rand, expr, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

# Intended shape of the generated employee data. It is declared for reference
# only: nothing in this cell passes `schema` to Spark, so the actual column
# types come from the expressions that build `df` below.
schema = StructType([
    StructField("employeeid", IntegerType(), False),
    StructField("departmentid", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("gender", StringType(), False),
    StructField("salary", DoubleType(), False),
    StructField("hire_date", DateType(), False)
])

names = ["Alex", "Jordan", "Taylor", "Morgan", "Casey", "Riley", "Jamie", "Avery", "Peyton", "Quinn"]
genders = ["M", "F"]


def _random_choice_sql(values):
    """Return a Spark SQL expression that picks one of `values` at random per row.

    Each value is embedded as a single-quoted SQL string literal, so the values
    must not themselves contain single quotes.
    """
    # Build the quoted literals without backslashes: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    literals = ",".join(f"'{value}'" for value in values)
    # element_at is 1-based, hence the +1 after scaling rand() to [0, len(values)).
    return f"element_at(array({literals}), cast(rand()*{len(values)}+1 as int))"


# One row per employee id (0..9999); every other column is randomly generated.
base_df = spark.range(10000).withColumnRenamed("id", "employeeid")
df = (base_df
      .withColumn("departmentid", (rand()*10 + 1).cast("int"))      # 1..10
      .withColumn("name", expr(_random_choice_sql(names)))
      .withColumn("age", (rand()*30 + 20).cast("int"))              # 20..49
      .withColumn("gender", expr(_random_choice_sql(genders)))
      .withColumn("salary", (rand()*70000 + 30000).cast("double"))  # [30000, 100000)
      # 2010-01-01 plus a random offset of 0..4999 days.
      .withColumn("hire_date", expr("date_add('2010-01-01', cast(rand()*5000 as int))"))
      # Fix the column order explicitly.
      .select("employeeid", "departmentid", "name", "age", "gender", "salary", "hire_date")
     )

display(df)

In [0]:
from pyspark.sql.functions import col,expr
# Keep only the rows with a salary above 50000 and print a text preview.
salary_over_50k = col("salary") > 50000
empl_tab = df.where(salary_over_50k)
empl_tab.show()

In [0]:
# Render the filtered employees with the notebook's rich table display.
display(empl_tab)

In [0]:
# Print the DataFrame's structure (column names, types, nullability) as a tree.
# Alternative: `empl_tab.schema` exposes the same information as a StructType
# object instead of printing it.
empl_tab.printSchema()

In [0]:
# 1. Two ways to build a column reference that is not tied to a DataFrame;
#    given the same column name, both resolve to the same column:
#      col('salary')
#      expr('salary')
#
# 2. Two equivalent ways to reference a column of a specific DataFrame:
#      empl_tab.salary      (attribute access)
#      empl_tab["salary"]   (item access)
empl_tab["salary"]

In [0]:
from pyspark.sql.functions import col,expr

# Project the salary cast to an integer (renamed `sal`) alongside the name.
sal_as_int = col("salary").cast("int").alias("sal")
emp_cast = empl_tab.select(sal_as_int, col("name"))
display(emp_cast)

In [0]:
# Inspect emp_cast's schema to confirm that `sal` is an integer column after the cast.
emp_cast.printSchema()

In [0]:
# Persist emp_cast as CSV, then read it back to check the round trip.
# Spark treats the save path as a directory and writes part files inside it.
employee_csv_path = "location/employee.csv"

# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error once the path already exists from a previous run.
emp_cast.write.format("csv").mode("overwrite").save(employee_csv_path)

# Read back from the path that was just written. The earlier
# spark.read.table("employee") looked up a catalog table that none of the
# visible cells create, so it did not return the data saved above.
# The files are written without a header row, so reuse emp_cast's schema to
# restore the column names and types.
spark.read.format("csv").schema(emp_cast.schema).load(employee_csv_path).display()