<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/12_spark_sql_predicate_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark Predicate Functions**
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#predicate-functions

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the SparkSession that the cells below query through `spark`.
spark = (
    SparkSession.builder
    .appName('spark-functions')
    .getOrCreate()
)

In [None]:
# ! : prefix-operator form of logical negation

sql = "select ! True as val"
spark.sql(sql).show(truncate=True)

+-----+
|  val|
+-----+
|false|
+-----+



In [None]:
# NOT : keyword form of logical negation

sql = "select NOT True as val"
spark.sql(sql).show(truncate=True)

+-----+
|  val|
+-----+
|false|
+-----+



In [None]:
# = : equality comparison (synonym: ==)
# Probes, in order: value vs value, NULL vs NULL, value vs NULL.

for sql in (
    "select 5 = 5.0 as val",
    "select NULL = NULL as val",
    "select 3 = NULL as val",
):
    spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|NULL|
+----+

+----+
| val|
+----+
|NULL|
+----+



In [None]:
# == : same equality comparison, spelled with two equals signs
# Probes, in order: value vs value, NULL vs NULL, value vs NULL.

for sql in (
    "select 5 == 5.0 as val",
    "select NULL == NULL as val",
    "select 3 == NULL as val",
):
    spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|NULL|
+----+

+----+
| val|
+----+
|NULL|
+----+



In [None]:
# <=> : null-safe equality
# Synonym: equal_null
# Probes, in order: value vs value, NULL vs NULL, value vs NULL.

for sql in (
    "select 5 <=> 5.0 as val",
    "select NULL <=> NULL as val",
    "select 4 <=> NULL as val",
):
    spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|true|
+----+

+-----+
|  val|
+-----+
|false|
+-----+



In [None]:
# equal_null : function form of the null-safe equality operator <=>
# Probes, in order: value vs value, NULL vs NULL, value vs NULL.

for sql in (
    "select Equal_Null(5,5.0) as val",
    "select Equal_Null(NULL,NULL)  as val",
    "select Equal_Null(5,NULL) as val",
):
    spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|true|
+----+

+-----+
|  val|
+-----+
|false|
+-----+



---
#### Spark Predicate Comparison: `=` vs `<=>` (Null Safe Equality)
---

| Aspect | `=` (Equality) | `<=>` (Null-safe Equality) |
|--------|----------------|----------------------------|
| **Basic Definition** | Standard equality comparison | Null-safe equality comparison |
| **NULL Handling** | Returns NULL if any operand is NULL | Returns TRUE if both operands are NULL, FALSE if one is NULL |
| **Syntax** | `column = value` | `column <=> value` |
| **Alternative Syntax** | `==` (Spark SQL and DataFrame API) | `equal_null()` (SQL function), `eqNullSafe()` (DataFrame API) |
| **NULL = NULL** | Returns NULL | Returns TRUE |
| **NULL = Value** | Returns NULL | Returns FALSE |
| **Value = NULL** | Returns NULL | Returns FALSE |
| **Spark SQL Support** | Yes | Yes |
| **DataFrame API Support** | Yes | Yes |
---
##### Key Differences
---
- **`=`** follows SQL standard: any comparison with NULL returns NULL
- **`<=>`** treats NULL as a comparable value: NULL <=> NULL returns TRUE
---
##### When to Use
---
##### Use `=` when:
- You're sure columns don't contain NULLs
- You want standard SQL behavior
- NULL comparisons should return NULL
- Performance is critical
---
##### Use `<=>` when:
---
- You need to compare columns that may contain NULLs
- You want to treat NULL as a comparable value
- You need consistent boolean results
- Working with anti-joins or finding missing records

In [None]:
# < : less-than comparison

sql = "select 5 < 10 as val"
spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+



In [None]:
# <= : less-than-or-equal comparison (integer against decimal literal)

sql = "select 5 <= 5.0 as val"
spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+



In [None]:
# > : greater-than comparison
# (The header previously read "<", which did not match the query.)

sql = "select 10 > 5 as val"
spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+



In [None]:
# >= : greater-than-or-equal comparison (integer against decimal literal)
# (The header previously read "<=", which did not match the query.)

sql = "select 10 >= 5.0 as val"
spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+



In [None]:
# and : logical AND, including its behaviour with a NULL operand

for sql in (
    "select True and True as val",
    "select False and True as val",
    "select NULL and True as val",   #-- very important to note
    "select NULL and False as val",  #-- very important to note
):
    spark.sql(sql).show(truncate=True)


+----+
| val|
+----+
|true|
+----+

+-----+
|  val|
+-----+
|false|
+-----+

+----+
| val|
+----+
|NULL|
+----+

+-----+
|  val|
+-----+
|false|
+-----+



In [None]:
# or : logical OR, including its behaviour with a NULL operand

for sql in (
    "select True or True as val",
    "select False or True as val",
    "select NULL or True as val",   #-- very important to note
    "select NULL or False as val",  #-- very important to note
):
    spark.sql(sql).show(truncate=True)

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|true|
+----+

+----+
| val|
+----+
|NULL|
+----+



---
#### Logical AND / OR Truth Table (with NULL)
---
| Operand1 | Operand2 | AND Result | OR Result |
|----------|----------|------------|-----------|
| TRUE     | TRUE     | TRUE       | TRUE      |
| TRUE     | FALSE    | FALSE      | TRUE      |
|||
| FALSE    | FALSE    | FALSE      | FALSE     |
|||
| TRUE     | NULL     | NULL       | TRUE      |
|||
| FALSE    | NULL     | FALSE      | NULL      |
|||
| NULL     | NULL     | NULL       | NULL      |

---
##### Key Rules
---

##### AND (`AND` in SQL; `&&` in the Scala Column API, `&` in PySpark)
- Returns **TRUE** only if both operands are TRUE
- Returns **FALSE** immediately if any operand is FALSE
- Returns **NULL** if no operand is FALSE and at least one is NULL
---
##### OR (`OR` in SQL; `||` in the Scala Column API, `|` in PySpark — in Spark SQL `||` is string concatenation)
- Returns **TRUE** immediately if any operand is TRUE  
- Returns **FALSE** only if both operands are FALSE
- Returns **NULL** if no operand is TRUE and at least one is NULL
---
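##### Short-Circuit Semantics
- `FALSE AND anything` = FALSE (the result does not depend on the other operand)
- `TRUE OR anything` = TRUE (the result does not depend on the other operand)
- Note: Spark SQL does not guarantee left-to-right evaluation order of subexpressions, so do not rely on the right side being skipped (for example, to guard a UDF against NULL input).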

In [None]:
# like : SQL wildcard pattern match (% stands for any run of characters)

sql = "select 'rahul' like '%hu%' as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# ilike : case-insensitive variant of like

sql = "select 'rahul' ilike '%HU%' as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# rlike : regular-expression match (here: does the string contain a digit?)

sql = "select 'rahul99' rlike '[0-9]' as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# regexp_like : function-call form of the regular-expression match

sql = "select regexp_like('rahul99','[0-9]') as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



#### Comparison of `LIKE`, `ILIKE`, and `RLIKE`

| Feature | LIKE | ILIKE | RLIKE (or REGEXP) |
| :--- | :--- | :--- | :--- |
| **Full Name** | Basic Pattern Match | **Case-Insensitive** Basic Pattern Match | **Regex** Pattern Match |
| **Case-Sensitive?** | **Yes** | **No** | **Yes** (by default) |
| **Pattern Type** | SQL Wildcards (`%`, `_`) | SQL Wildcards (`%`, `_`) | **Regular Expressions** (Java regex syntax) |
| **Performance** | **Fastest** (simple comparison) | Fast (similar to LIKE) | **Slower** (regex engine overhead) |
| **Use Case** | Simple, case-sensitive text filtering.<br>e.g., Find names starting with "Jo". | Simple, case-**in**sensitive text filtering.<br>e.g., Find emails ending with "@gmail.com". | Complex, flexible pattern matching.<br>e.g., Validate email format, extract log data. |

In [None]:
# in : membership test against a literal list of values

sql = "select 'rahul' in ('rahul','lathika','skylr') as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# isnull : true when the argument is NULL

sql = "select isnull(NULL) as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# isnotnull : true when the argument is not NULL

sql = "select isNotnull('rahul') as val "
spark.sql(sql).show(truncate=False)

+----+
|val |
+----+
|true|
+----+



In [None]:
# isnan : true only when the argument evaluates to NaN
# Probes, in order: an integer, a non-numeric string, NULL,
# the string 'Nan', and 'Nan' explicitly cast to DOUBLE.

for sql in (
    "select 99 AS val,isNan(99) as Eval ",
    "select 'rahul' AS val,isNan('rahul') as Eval ",
    "select NULL AS val,isNan(NULL) as Eval ",
    "select 'Nan' AS val,isNan('Nan') as Eval ",
    "select CAST('Nan' AS DOUBLE) as val,isNan(CAST('Nan' AS DOUBLE)) as Eval ",
):
    spark.sql(sql).show(truncate=False)

+---+-----+
|val|Eval |
+---+-----+
|99 |false|
+---+-----+

+-----+-----+
|val  |Eval |
+-----+-----+
|rahul|false|
+-----+-----+

+----+-----+
|val |Eval |
+----+-----+
|NULL|false|
+----+-----+

+---+----+
|val|Eval|
+---+----+
|Nan|true|
+---+----+

+---+----+
|val|Eval|
+---+----+
|NaN|true|
+---+----+



---
#### Comparison of NULL and NaN
---

| Feature | NULL | NaN (Not a Number) |
| :--- | :--- | :--- |
| **Full Name** | Absence of Value / Unknown | **Not a Number** |
| **Data Type** | Applicable to **all** data types | Primarily for **floating-point** numeric types (Float, Double) |
| **Semantic Meaning** | Represents **missing**, **unknown**, or **inapplicable** data | Represents an **undefined** or **unrepresentable** numeric value |
| **Origin/Source** | Unpopulated fields, JOIN mismatches, explicit NULL inserts | Result of **invalid mathematical operations** (0/0, ∞-∞, √-1) |
| **Comparison Behavior** | `NULL = NULL` → **NULL**<br>`NULL IS NULL` → **True**<br>| `NaN = NaN` → **True**<br>`NaN` is considered **equal to itself** |
| **Propagation** | Most operations with NULL return **NULL** (propagates) | Mathematical operations with NaN typically return **NaN** (propagates) |
| **Sorting Order** | Treated as the **smallest** possible value by default<br>NULLs appear **first** in ascending order | Considered **larger** than any other numeric value<br>NaNs appear **after** NULLs and regular numbers |
---
## Quick Summary
---
- **`NULL`**: Represents **missing or unknown data** across all data types
- **`NaN`**: Represents an **invalid numeric result** specifically in floating-point computations
---
## Key Behavioral Differences
---
```sql
-- NULL comparisons
SELECT NULL = NULL;        -- → NULL (not True!)
SELECT NULL IS NULL;       -- → True
SELECT NULL IS NOT NULL;   -- → False

-- NaN comparisons  
SELECT NaN = NaN;          -- → True (unlike NULL!)
SELECT NaN IS NULL;        -- → False
SELECT isnan(NaN);         -- → True
SELECT isnan(NULL);        -- → False