<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/18_spark_dataframe_columns_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark DataFrame Column Methods**
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.html#

In [3]:
# Install Java and PySpark

import warnings
warnings.filterwarnings('ignore')

!apt-get update -qq
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark -q

# Set Java home
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

import pyspark
print(pyspark.__version__)

!pip install "numpy<2.0"

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
3.5.1


In [4]:
from pyspark.sql import SparkSession

# Get the active session if there is one, otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('spark-dataframe')
    .getOrCreate()
)

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import col, lit
from datetime import datetime, date

# Employee fixture rows, grouped by the scenario each group is meant to exercise.
# Tuple order follows the schema below:
# (employee_id, name, department, salary, age, is_active, hire_date, city)
complete_rows = [
    (1, "Alice", "Engineering", 75000.50, 28, True, date(2020, 1, 15), "New York"),
    (2, "Bob", "Marketing", 65000.75, 32, False, date(2019, 3, 20), "San Francisco"),
    (3, "Charlie", "Engineering", 82000.25, 35, True, date(2018, 7, 10), "New York"),
    (4, "Diana", "Sales", 58000.00, 29, True, date(2021, 5, 5), "Chicago"),
    (5, "Eve", "HR", 62000.80, 31, False, date(2020, 11, 30), "Boston"),
]

# Each of these rows has at least one missing (None) field.
rows_with_nulls = [
    (6, "Frank", None, 71000.40, 40, True, date(2017, 8, 25), None),
    (7, None, "Engineering", 68000.60, 27, False, date(2022, 2, 14), "Seattle"),
    (8, "Grace", "Marketing", None, 33, True, date(2019, 9, 8), "Austin"),
    (9, "Henry", "Sales", 59000.90, None, False, date(2021, 12, 1), "Denver"),
    (10, "Ivy", "HR", 63000.30, 36, None, None, "Portland"),
]

# Empty strings, zeros, extreme values, and more NULL departments.
edge_case_rows = [
    (11, "", "Engineering", 0.0, 0, False, date(2023, 1, 1), ""),
    (12, "Jack", "Sales", 1000000.99, 99, True, date(2015, 12, 31), "Miami"),
    (13, "Karen", "Marketing", 45000.00, 22, True, date(2023, 6, 15), "Atlanta"),
    (14, "Leo", "Engineering", 95000.00, 45, False, date(2016, 4, 18), "New York"),
    (15, "Mona", None, 52000.50, 26, True, date(2022, 8, 9), "Chicago"),
]

data = complete_rows + rows_with_nulls + edge_case_rows

# Schema: (column name, Spark type) pairs; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("employee_id", IntegerType()),
        ("name", StringType()),
        ("department", StringType()),
        ("salary", DoubleType()),
        ("age", IntegerType()),
        ("is_active", BooleanType()),
        ("hire_date", DateType()),
        ("city", StringType()),
    ]
])

dataframe = spark.createDataFrame(data, schema=schema)
dataframe.printSchema()
dataframe.show(truncate=False)

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- city: string (nullable = true)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|

In [14]:
# alias -- expose the `name` column under a different output label

from pyspark.sql.functions import expr

renamed_name = expr('name').alias('empName')
dataframe.select(renamed_name).show(truncate=False)

+-------+
|empName|
+-------+
|Alice  |
|Bob    |
|Charlie|
|Diana  |
|Eve    |
|Frank  |
|NULL   |
|Grace  |
|Henry  |
|Ivy    |
|       |
|Jack   |
|Karen  |
|Leo    |
|Mona   |
+-------+



In [20]:
# asc -- ascending sort; in the output below the NULL name is listed first

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').asc()],
    [col('department').asc(), col('name').asc()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|11         |       |Engineering|0.0       |0   |false    |2023-01-01|             |
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-0

In [21]:
# asc_nulls_first -- ascending sort with NULL values placed before non-NULL ones

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').asc_nulls_first()],
    [col('department').asc_nulls_first(), col('name').asc_nulls_first()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|11         |       |Engineering|0.0       |0   |false    |2023-01-01|             |
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-0

In [22]:
# asc_nulls_last -- ascending sort with NULL values placed after non-NULL ones

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').asc_nulls_last()],
    [col('department').asc_nulls_last(), col('name').asc_nulls_last()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|11         |       |Engineering|0.0       |0   |false    |2023-01-01|             |
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-1

In [23]:
# desc -- descending sort

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').desc()],
    [col('department').desc(), col('name').desc()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|15         |Mona   |NULL       |52000.5   |26  |true     |2022-08-09|Chicago      |
|14         |Leo    |Engineering|95000.0   |45  |false    |2016-04-18|New York     |
|13         |Karen  |Marketing  |45000.0   |22  |true     |2023-06-15|Atlanta      |
|12         |Jack   |Sales      |1000000.99|99  |true     |2015-12-31|Miami        |
|10         |Ivy    |HR         |63000.3   |36  |NULL     |NULL      |Portland     |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-12-01|Denver       |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-1

In [28]:
# desc_nulls_first -- descending sort with NULL values placed before non-NULL ones

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').desc_nulls_first()],
    [col('department').desc_nulls_first(), col('name').desc_nulls_first()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |
|15         |Mona   |NULL       |52000.5   |26  |true     |2022-08-09|Chicago      |
|14         |Leo    |Engineering|95000.0   |45  |false    |2016-04-18|New York     |
|13         |Karen  |Marketing  |45000.0   |22  |true     |2023-06-15|Atlanta      |
|12         |Jack   |Sales      |1000000.99|99  |true     |2015-12-31|Miami        |
|10         |Ivy    |HR         |63000.3   |36  |NULL     |NULL      |Portland     |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-12-01|Denver       |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-0

In [29]:
# desc_nulls_last -- descending sort with NULL values placed after non-NULL ones

# First a single-key sort on name, then a two-key sort on department and name.
for sort_keys in (
    [col('name').desc_nulls_last()],
    [col('department').desc_nulls_last(), col('name').desc_nulls_last()],
):
    dataframe.orderBy(*sort_keys).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |
+-----------+-------+-----------+----------+----+---------+----------+-------------+
|15         |Mona   |NULL       |52000.5   |26  |true     |2022-08-09|Chicago      |
|14         |Leo    |Engineering|95000.0   |45  |false    |2016-04-18|New York     |
|13         |Karen  |Marketing  |45000.0   |22  |true     |2023-06-15|Atlanta      |
|12         |Jack   |Sales      |1000000.99|99  |true     |2015-12-31|Miami        |
|10         |Ivy    |HR         |63000.3   |36  |NULL     |NULL      |Portland     |
|9          |Henry  |Sales      |59000.9   |NULL|false    |2021-12-01|Denver       |
|8          |Grace  |Marketing  |NULL      |33  |true     |2019-09-08|Austin       |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-1

In [30]:
# astype -- convert the integer `age` column to float

age_as_float = col('age').astype('float')
dataframe.select(age_as_float).show(truncate=False)

+----+
|age |
+----+
|28.0|
|32.0|
|35.0|
|29.0|
|31.0|
|40.0|
|27.0|
|33.0|
|NULL|
|36.0|
|0.0 |
|99.0|
|22.0|
|45.0|
|26.0|
+----+



In [31]:
# cast -- convert the integer `age` column to float

age_cast_to_float = col('age').cast('float')
dataframe.select(age_cast_to_float).show(truncate=False)

+----+
|age |
+----+
|28.0|
|32.0|
|35.0|
|29.0|
|31.0|
|40.0|
|27.0|
|33.0|
|NULL|
|36.0|
|0.0 |
|99.0|
|22.0|
|45.0|
|26.0|
+----+



Note: `cast` and `astype` are identical in functionality; `astype` is simply an alias for `cast`.

In [33]:
# between -- range check on `age`; both bounds are inclusive (age 40 is true below)
# and a NULL age yields NULL rather than false

age_in_range = col('age').between(25, 40).alias('ageBTW')
dataframe.select(age_in_range).show(truncate=False)

+------+
|ageBTW|
+------+
|true  |
|true  |
|true  |
|true  |
|true  |
|true  |
|true  |
|true  |
|NULL  |
|true  |
|false |
|false |
|false |
|false |
|true  |
+------+



In [35]:
# contains -- substring test on `department`; NULL departments yield NULL

has_eng = col('department').contains('Eng').alias('containSubStr')
dataframe.select(has_eng).show(truncate=False)

+-------------+
|containSubStr|
+-------------+
|true         |
|false        |
|true         |
|false        |
|false        |
|NULL         |
|true         |
|false        |
|false        |
|false        |
|true         |
|false        |
|false        |
|true         |
|NULL         |
+-------------+



In [52]:
# startswith -- prefix test on `department`; NULL departments yield NULL

# The alias names the operation actually performed (it previously reused the
# label from the `contains` example).
starts_with_mar = col('department').startswith('Mar').alias('startsWithSubStr')
dataframe.select(starts_with_mar).show(truncate=False)

+-------------+
|containSubStr|
+-------------+
|false        |
|true         |
|false        |
|false        |
|false        |
|NULL         |
|false        |
|true         |
|false        |
|false        |
|false        |
|false        |
|true         |
|false        |
|NULL         |
+-------------+



In [36]:
# endswith -- suffix test on `department`; NULL departments yield NULL

# The alias names the operation actually performed (it previously reused the
# label from the `contains` example).
ends_with_ing = col('department').endswith('ing').alias('endsWithSubStr')
dataframe.select(ends_with_ing).show(truncate=False)

+-------------+
|containSubStr|
+-------------+
|true         |
|true         |
|true         |
|false        |
|false        |
|NULL         |
|true         |
|true         |
|false        |
|false        |
|true         |
|false        |
|true         |
|true         |
|NULL         |
+-------------+



In [37]:
# eqNullSafe -- NULL-safe equality: a NULL department compares as false, not NULL

is_engineering_dept = col('department').eqNullSafe('Engineering').alias('eq_col')
dataframe.select(is_engineering_dept).show(truncate=False)

+------+
|eq_col|
+------+
|true  |
|false |
|true  |
|false |
|false |
|false |
|true  |
|false |
|false |
|false |
|true  |
|false |
|false |
|true  |
|false |
+------+



In [39]:
# ilike -- case-insensitive SQL LIKE; '%eng%' matches 'Engineering' below

matches_eng = col('department').ilike('%eng%').alias('ilike_col')
dataframe.select(matches_eng).show(truncate=False)

+---------+
|ilike_col|
+---------+
|true     |
|false    |
|true     |
|false    |
|false    |
|NULL     |
|true     |
|false    |
|false    |
|false    |
|true     |
|false    |
|false    |
|true     |
|NULL     |
+---------+



In [50]:
# rlike -- regular-expression match (regex syntax, not SQL LIKE wildcards)

# `%` is a literal character in a regex, so the earlier pattern '%[0-9]%' only
# matched a digit surrounded by percent signs. '[0-9]' matches any value that
# contains a digit anywhere; NULL departments still yield NULL.
has_digit = col('department').rlike('[0-9]').alias('rlike_col')
dataframe.select(has_digit).show(truncate=False)

+---------+
|ilike_col|
+---------+
|false    |
|false    |
|false    |
|false    |
|false    |
|NULL     |
|false    |
|false    |
|false    |
|false    |
|false    |
|false    |
|false    |
|false    |
|NULL     |
+---------+



In [43]:
# isNotNull -- true where `department` has a value, false where it is NULL

department_present = col('department').isNotNull().alias('IS_notNull')
dataframe.select(department_present).show(truncate=False)

+----------+
|IS_notNull|
+----------+
|true      |
|true      |
|true      |
|true      |
|true      |
|false     |
|true      |
|true      |
|true      |
|true      |
|true      |
|true      |
|true      |
|true      |
|false     |
+----------+



In [44]:
# isNull -- true where `department` is NULL, false where it has a value

# The alias now matches the check being made; it previously read 'IS_notNull',
# which states the opposite of what this column holds.
department_missing = col('department').isNull().alias('IS_Null')
dataframe.select(department_missing).show(truncate=False)

+----------+
|IS_notNull|
+----------+
|false     |
|false     |
|false     |
|false     |
|false     |
|true      |
|false     |
|false     |
|false     |
|false     |
|false     |
|false     |
|false     |
|false     |
|true      |
+----------+



In [45]:
# isin -- membership test against a set of candidates; NULL departments yield NULL

allowed_departments = [lit('Engineering'), lit('Marketing')]
in_allowed = col('department').isin(*allowed_departments).alias('isin_col')
dataframe.select(in_allowed).show(truncate=False)

+--------+
|isin_col|
+--------+
|true    |
|true    |
|true    |
|false   |
|false   |
|NULL    |
|true    |
|true    |
|false   |
|false   |
|true    |
|false   |
|true    |
|true    |
|NULL    |
+--------+



In [53]:
# when - otherwise -- conditional column: 1 for Engineering rows, 0 for all others
# (rows with a NULL department fall through to the `otherwise` branch)

from pyspark.sql.functions import when

is_engineering = expr(''' department in ('Engineering') ''')
engineering_flag = when(is_engineering, 1).otherwise(0)
dataframe.withColumn('newColumn', engineering_flag).show(truncate=False)

+-----------+-------+-----------+----------+----+---------+----------+-------------+---------+
|employee_id|name   |department |salary    |age |is_active|hire_date |city         |newColumn|
+-----------+-------+-----------+----------+----+---------+----------+-------------+---------+
|1          |Alice  |Engineering|75000.5   |28  |true     |2020-01-15|New York     |1        |
|2          |Bob    |Marketing  |65000.75  |32  |false    |2019-03-20|San Francisco|0        |
|3          |Charlie|Engineering|82000.25  |35  |true     |2018-07-10|New York     |1        |
|4          |Diana  |Sales      |58000.0   |29  |true     |2021-05-05|Chicago      |0        |
|5          |Eve    |HR         |62000.8   |31  |false    |2020-11-30|Boston       |0        |
|6          |Frank  |NULL       |71000.4   |40  |true     |2017-08-25|NULL         |0        |
|7          |NULL   |Engineering|68000.6   |27  |false    |2022-02-14|Seattle      |1        |
|8          |Grace  |Marketing  |NULL      |33  |t