<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/4_spark_sql_type_casting_and_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spark Casting Functions
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#conversion-functions

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every SQL example in this notebook runs on.
spark = (
    SparkSession.builder
    .appName('spark-casting')
    .getOrCreate()
)

In [None]:
# cast(expr as type): the generic conversion syntax, here turning the string '109' into an int

sql = "select cast('109' as int) as castedValue"
spark.sql(sql).show(truncate=False)

+-----------+
|castedValue|
+-----------+
|109        |
+-----------+



In [None]:
# Cast the number 100 to each of the string types; every query prints 100.
for sql in (
    # string: takes no length, which makes it the most convenient target
    "select cast(100 as string) as castedValue",
    # char: a length has to be specified
    "select cast(100 as char(15)) as castedValue",
    # varchar: a length has to be specified
    "select cast(100 as varchar(10)) as castedValue",
):
    spark.sql(sql).show(truncate=False)


+-----------+
|castedValue|
+-----------+
|100        |
+-----------+

+-----------+
|castedValue|
+-----------+
|100        |
+-----------+

+-----------+
|castedValue|
+-----------+
|100        |
+-----------+



# Supported String Types in Spark SQL

| Type | Length Required | Notes / Usage |
|------|----------------|---------------|
| `STRING` | No | Standard text type, safest choice, works without specifying length |
| `CHAR(n)` | Yes | Fixed-length string of length `n`. Padding to `n` applies to table columns; a plain `CAST(... AS CHAR(n))`, as in the cell above, returns the value unpadded |
| `VARCHAR(n)` | Yes | Variable-length string, max length = `n`. Most use cases prefer `STRING` |

# Other Specific Casting Functions

| Function | Description |
|----------|-------------|
| `tinyint(expr)` | Casts the value `expr` to the target data type `tinyint` |
| `smallint(expr)` | Casts the value `expr` to the target data type `smallint` |
|  |  |
| `int(expr)` | Casts the value `expr` to the target data type `int` |
| `bigint(expr)` | Casts the value `expr` to the target data type `bigint` |
|  |  |
| `binary(expr)` | Casts the value `expr` to the target data type `binary` |
|  |  |
| `boolean(expr)` | Casts the value `expr` to the target data type `boolean` |
|  |  |
| `date(expr)` | Casts the value `expr` to the target data type `date` |
|  |  |
| `decimal(expr)` | Casts the value `expr` to the target data type `decimal` |
| `double(expr)` | Casts the value `expr` to the target data type `double` |
| `float(expr)` | Casts the value `expr` to the target data type `float` |
|  |  |
| `string(expr)` | Casts the value `expr` to the target data type `string` |
|  |  |
| `timestamp(expr)` | Casts the value `expr` to the target data type `timestamp` |

In [None]:
# tinyint(expr)
#
# Range: -128 .. 127

for sql in (
    # fits in the range -> 6
    "select tinyint('6') as castedOut",
    # 200 is above the maximum, so the conversion returns NULL (see the output)
    "select tinyint('200') as castedOut",
):
    spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|6        |
+---------+

+---------+
|castedOut|
+---------+
|NULL     |
+---------+



In [None]:
# smallint

# Minimum value: -32,768
# Maximum value: 32,767

sql = '''select smallint('6') as castedOut'''
spark.sql(sql).show(truncate = False)

# The literal must not contain thousands separators: a string with a comma in it is not a valid
# number and would come back NULL because of the comma, regardless of the range.
# '35000' is a valid number above the smallint maximum, so the NULL it produces demonstrates overflow.
# NOTE(review): NULL-on-failure is the non-ANSI cast behavior - confirm ANSI mode is off where this runs.
sql = '''select smallint('35000') as castedOut'''
spark.sql(sql).show(truncate = False)

+---------+
|castedOut|
+---------+
|6        |
+---------+

+---------+
|castedOut|
+---------+
|NULL     |
+---------+



In [None]:
# int

# Minimum value: -2,147,483,648
# Maximum value: 2,147,483,647

# A fractional numeric input loses its fraction: 10.5 becomes 10 in the output.
sql = '''select int(10.5) as castedOut'''
spark.sql(sql).show(truncate = False)

# The literal must not contain thousands separators: a string with a comma in it is not a valid
# number and would come back NULL because of the comma, regardless of the range.
# '2222000000' is a valid number above the int maximum, so the NULL it produces demonstrates overflow.
# NOTE(review): NULL-on-failure is the non-ANSI cast behavior - confirm ANSI mode is off where this runs.
sql = '''select int('2222000000') as castedOut'''
spark.sql(sql).show(truncate = False)

+---------+
|castedOut|
+---------+
|10       |
+---------+

+---------+
|castedOut|
+---------+
|NULL     |
+---------+



In [None]:
# bigint

# Minimum value: -9,223,372,036,854,775,808
# Maximum value: 9,223,372,036,854,775,807

sql = '''select bigint('9000000000000') as castedOut'''
spark.sql(sql).show(truncate = False)

# Out-of-range example: the maximum value + 1, written without thousands separators.
# (A string containing commas is not a valid number and returns NULL for that reason alone;
# 9000000000000000000 would still fit in a bigint, so it cannot be used to show overflow.)
# NOTE(review): NULL-on-failure is the non-ANSI cast behavior - confirm ANSI mode is off where this runs.
sql = '''select bigint('9223372036854775808') as castedOut'''
spark.sql(sql).show(truncate = False)

+-------------+
|castedOut    |
+-------------+
|9000000000000|
+-------------+

+---------+
|castedOut|
+---------+
|NULL     |
+---------+



In [None]:
# binary(expr): the output lists the bytes of the string in hexadecimal

sql = "select binary('rahul') as castedOut"
spark.sql(sql).show(truncate=False)

+----------------+
|castedOut       |
+----------------+
|[72 61 68 75 6C]|
+----------------+



In [None]:
# boolean(expr): how different literals convert (results are in the output below)

for sql in (
    # '1' and 1 -> true
    "select '1' as val,boolean('1') as castedOut",
    "select 1 as val,boolean(1) as castedOut",
    # '0' and 0 -> false
    "select '0' as val,boolean('0') as castedOut",
    "select 0 as val,boolean(0) as castedOut",
    # the string '-1' -> NULL, whereas the number -1 -> true
    "select '-1' as val,boolean('-1') as castedOut",
    "select -1 as val,boolean(-1) as castedOut",
    # the boolean literal and its string spellings (full / abbreviated, either case) -> true
    "select True as val,boolean(True) as castedOut",
    "select 'True' as val,boolean('True') as castedOut",
    "select 'true' as val,boolean('true') as castedOut",
    "select 'T' as val,boolean('T') as castedOut",
    "select 't' as val,boolean('t') as castedOut",
    # another non-zero number -> true
    "select 100 as val,boolean(100) as castedOut",
    # arbitrary text and a NULL input
    "select 'rahul' as val,boolean('rahul') as castedOut",
    "select NULL as val,boolean(NULL) as castedOut",
):
    spark.sql(sql).show(truncate=False)

+---+---------+
|val|castedOut|
+---+---------+
|1  |true     |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|1  |true     |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|0  |false    |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|0  |false    |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|-1 |NULL     |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|-1 |true     |
+---+---------+

+----+---------+
|val |castedOut|
+----+---------+
|true|true     |
+----+---------+

+----+---------+
|val |castedOut|
+----+---------+
|True|true     |
+----+---------+

+----+---------+
|val |castedOut|
+----+---------+
|true|true     |
+----+---------+

+---+---------+
|val|castedOut|
+---+---------+
|T  |true     |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|t  |true     |
+---+---------+

+---+---------+
|val|castedOut|
+---+---------+
|100|true     |
+---+---------+

+-----+------

In [None]:
# date(expr): expects an ISO-style date string; for any other layout prefer to_date() with an explicit format

sql = "select date('2025-10-25') as castedOut"
spark.sql(sql).show(truncate=False)

+----------+
|castedOut |
+----------+
|2025-10-25|
+----------+



# Spark SQL Date Parsing Notes

## Understanding Date Parsing Behavior

Spark SQL has two main approaches to parsing dates:
1. **Default parsing** with `DATE()` or `CAST AS DATE` - limited format support
2. **Custom format parsing** with `TO_DATE(expr, format)` - flexible format support

---

## 1. Default `DATE()` / `CAST(... AS DATE)` Behavior

### Reliably Supported Format

| Pattern | Example | Notes |
|---------|---------|-------|
| `yyyy-MM-dd` | `2025-10-25` | Standard ISO format - **the form to rely on**; the rows below are the same ISO date followed by a time part |
| `yyyy-MM-dd HH:mm:ss` | `2025-10-25 14:30:00` | Time part ignored for `DATE()` |
| `yyyy-MM-ddTHH:mm:ss` | `2025-10-25T14:30:00` | ISO 8601 format |
| `yyyy-MM-dd HH:mm:ss.SSS` | `2025-10-25 14:30:00.123` | Milliseconds ignored |

> **Critical Rule:** Default `DATE()` and `CAST AS DATE` primarily support `yyyy-MM-dd` format. Other formats may return NULL or require `TO_DATE()` with explicit format specification.


## 2. Using `TO_DATE(string, format)` for Custom Patterns

Use `TO_DATE()` when the date string is not in ISO `yyyy-MM-dd` format or contains month names.

| Format String | Example Input | Example Query |
|---------------|---------------|---------------|
| `yyyyMMdd` | `20251025` | `SELECT TO_DATE('20251025', 'yyyyMMdd');` |
| `yyyy-MMM-dd` | `2025-Oct-25` | `SELECT TO_DATE('2025-Oct-25', 'yyyy-MMM-dd');` |
| `dd-MM-yyyy` | `25-10-2025` | `SELECT TO_DATE('25-10-2025', 'dd-MM-yyyy');` |
| `dd/MM/yyyy` | `25/10/2025` | `SELECT TO_DATE('25/10/2025', 'dd/MM/yyyy');` |
| `MMM dd, yyyy` | `Oct 25, 2025` | `SELECT TO_DATE('Oct 25, 2025', 'MMM dd, yyyy');` |
| `dd MMM yyyy` | `25 Oct 2025` | `SELECT TO_DATE('25 Oct 2025', 'dd MMM yyyy');` |
| `MM/dd/yyyy` | `10/25/2025` | `SELECT TO_DATE('10/25/2025', 'MM/dd/yyyy');` |

> **Important:** Pattern letters are case-sensitive. Use lowercase `d` for day-of-month, lowercase `yyyy` for year, and `MM` for month number. Use `MMM` for abbreviated month names.

---

## 3. Key Notes

1. **Default `DATE()` support is limited** - Only `yyyy-MM-dd` ISO format is reliably auto-cast
2. **Non-ISO formats require `TO_DATE()`** - Use format string parameter for custom patterns
3. **Month names always require `TO_DATE()`** - No month name parsing in default `DATE()`
4. **Time parts are ignored** - `DATE()` extracts only the date portion from timestamps
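5. **Invalid strings return NULL** - Both `DATE()` and `TO_DATE()` return NULL for unparseable strings when ANSI mode is off (`spark.sql.ansi.enabled=false`); with ANSI mode on they raise an error instead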
6. **Pattern syntax follows Java DateTimeFormatter** - Refer to Spark documentation for all pattern letters

---

## 4. Examples
```sql
-- ✅ Default DATE() works - ISO format
SELECT DATE('2025-10-25') AS result;  -- 2025-10-25
SELECT DATE('2025-10-25 14:30:00') AS result;  -- 2025-10-25

-- ✅ Custom TO_DATE() works for all formats
SELECT TO_DATE('20251025', 'yyyyMMdd') AS result;  -- 2025-10-25
SELECT TO_DATE('2025-Oct-25', 'yyyy-MMM-dd') AS result;  -- 2025-10-25
SELECT TO_DATE('25-10-2025', 'dd-MM-yyyy') AS result;  -- 2025-10-25
SELECT TO_DATE('Oct 25, 2025', 'MMM dd, yyyy') AS result;  -- 2025-10-25
SELECT TO_DATE('10/25/2025', 'MM/dd/yyyy') AS result;  -- 2025-10-25
```

---

## 5. Best Practices

1. **Prefer ISO format** (`yyyy-MM-dd`) when possible - no parsing needed
2. **Always use `TO_DATE()` with format** for non-ISO strings
3. **Test your date patterns** - pattern letters are case-sensitive
4. **Handle NULLs gracefully** - invalid dates will return NULL
5. **Document date formats** in your data pipelines for maintainability

In [None]:
# decimal(expr) versus cast(expr as decimal(p, s))

for sql in (
    # decimal() takes no precision/scale; the default has scale 0 (decimal(10,0)),
    # so the fraction is rounded away and the output is 124
    "select decimal(123.56) as castedOut",
    # to keep the fraction, cast with an explicit precision and scale: decimal(10,3) -> 123.560
    "select cast(123.56 as decimal(10,3)) castedOut",
):
    spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|124      |
+---------+

+---------+
|castedOut|
+---------+
|123.560  |
+---------+



In [None]:
# float(expr): converts to a single-precision floating-point value (no precision/scale arguments)

sql = "select float(123.56) as castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|123.56   |
+---------+



In [None]:
# double(expr): converts to a double-precision floating-point value (no precision/scale arguments)

sql = "select double(123.56) as castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|123.56   |
+---------+



#### Quick Comparison

| Feature | FLOAT | DOUBLE | DECIMAL |
|---------|-------|--------|---------|
| **Precision** | 32-bit (~7 digits) | 64-bit (~15-16 digits) | Up to 38 digits |
| **Storage** | 4 bytes | 8 bytes | Variable |
| **Arithmetic** | Approximate | Approximate | Exact |
| **Rounding Errors** | Yes | Yes | No |
| **Performance** | Fastest | Fast | Slower |
| **Best For** | Scientific calculations, ML | General floating-point | Financial, monetary |

#### Key Rules

1. **Financial/Money/Currency data → Always use DECIMAL**
2. **Scientific/ML data → Use FLOAT or DOUBLE**


In [None]:
# string(expr): converts the number to its text form

sql = "select string(123.56) as castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|123.56   |
+---------+



In [None]:
# timestamp(expr): converts a 'yyyy-MM-dd HH:mm:ss' string to a timestamp

sql = "select timestamp('2025-10-25 12:12:12') as castedOut"
spark.sql(sql).show(truncate=False)


+-------------------+
|castedOut          |
+-------------------+
|2025-10-25 12:12:12|
+-------------------+



# Type Conversion Functions (to_ Functions) in Spark SQL

| Function | Converts From | Converts To | Description |
|----------|---------------|-------------|-------------|
| `to_char(expr, format)` | Numeric (date / timestamp input is also accepted in recent Spark releases) | String | Converts a value to a string formatted according to `format` |
| `to_varchar(expr, format)` | Same as `to_char` | String | Alias of `to_char`; the second argument is a format pattern, not a maximum length |
|  |  |  |
| `to_number(expr, format)` | String | Numeric (decimal) | Converts a string to a number by matching it against the given format |
|  |  |  |
| `to_date(expr, format)` | String / Timestamp | Date | Converts a string or timestamp to a date using the specified format |
| `to_timestamp(expr, format)` | String | Timestamp | Converts a string to a timestamp using the specified format |
|  |  |  |
| `to_json(expr)` | Struct / Map / Array | String (JSON) | Converts complex data structures to JSON string format |
|  |  |  |
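| `to_binary(expr[, format])` | String | Binary | Converts a string to binary data; the optional `format` (`hex` by default, also `utf-8` or `base64`) says how the string is encoded |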

In [None]:
# to_char(expr, format): the format pattern is mandatory; '999.99' renders 167 as 167.00

sql = "SELECT to_char(167, '999.99') AS castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|167.00   |
+---------+



In [None]:
# to_varchar(expr, format): the format pattern is mandatory; '99999.99' renders 12345 as 12345.00

sql = "SELECT to_varchar(12345,'99999.99') AS castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|12345.00 |
+---------+



In [None]:
# to_number(expr, format): the format pattern is mandatory and describes the input string

sql = "SELECT TO_NUMBER('12345', '99999') AS castedOut"
spark.sql(sql).show(truncate=False)


+---------+
|castedOut|
+---------+
|12345    |
+---------+



In [None]:
# to_date(expr, format): returns a date
#
# The format describes how the INPUT string is laid out; it is not an output format.

for sql in (
    "SELECT to_date('25-10-2025', 'dd-MM-yyyy') AS castedOut",
    "SELECT to_date('10/25/2025', 'MM/dd/yyyy') AS castedOut",
    "SELECT to_date('Oct 25, 2025', 'MMM dd, yyyy') AS castedOut",
    # a timestamp input needs no format; only its date part is kept
    "SELECT to_date(timestamp('2025-10-25 12:30:45')) AS castedOut",
):
    spark.sql(sql).show(truncate=False)


+----------+
|castedOut |
+----------+
|2025-10-25|
+----------+

+----------+
|castedOut |
+----------+
|2025-10-25|
+----------+

+----------+
|castedOut |
+----------+
|2025-10-25|
+----------+

+----------+
|castedOut |
+----------+
|2025-10-25|
+----------+



In [None]:
# to_timestamp(expr, format): returns a timestamp
#
# The format describes how the INPUT string is laid out; it is not an output format.

for sql in (
    "SELECT to_timestamp('2025-10-25 12:30:45', 'yyyy-MM-dd HH:mm:ss') AS castedOut",
    "SELECT to_timestamp('10/25/2025 14:30:00', 'MM/dd/yyyy HH:mm:ss') AS castedOut",
    # fractional seconds are kept when the pattern includes them
    "SELECT to_timestamp('2025-10-25 12:30:45.123', 'yyyy-MM-dd HH:mm:ss.SSS') AS castedOut",
    # the format can be omitted for this 'yyyy-MM-dd HH:mm:ss' input
    "SELECT to_timestamp('2025-10-25 12:30:45') AS castedOut",
):
    spark.sql(sql).show(truncate=False)

+-------------------+
|castedOut          |
+-------------------+
|2025-10-25 12:30:45|
+-------------------+

+-------------------+
|castedOut          |
+-------------------+
|2025-10-25 14:30:00|
+-------------------+

+-----------------------+
|castedOut              |
+-----------------------+
|2025-10-25 12:30:45.123|
+-----------------------+

+-------------------+
|castedOut          |
+-------------------+
|2025-10-25 12:30:45|
+-------------------+



In [None]:
# to_json on a map: shows the input, its type, the JSON text and the (string) type of the result

sql = """
with cte as
(
select map('name','rahul','interest','data engineering') as inp
)
select
  inp,
  typeOf(inp) as inpType,
  to_json(inp) as out,
  typeOf(to_json(inp)) as outType
from cte
"""
spark.sql(sql).show(truncate=False)

+---------------------------------------------+------------------+----------------------------------------------+-------+
|inp                                          |inpType           |out                                           |outType|
+---------------------------------------------+------------------+----------------------------------------------+-------+
|{name -> rahul, interest -> data engineering}|map<string,string>|{"name":"rahul","interest":"data engineering"}|string |
+---------------------------------------------+------------------+----------------------------------------------+-------+



In [None]:
# to_json on a named_struct: shows the input, its type, the JSON text and the (string) type of the result

sql = """
with cte as
(
select named_struct('name','rahul','interest','data engineering') as inp
)
select
  inp,
  typeOf(inp) as inpType,
  to_json(inp) as out,
  typeOf(to_json(inp)) as outType
from cte
"""
spark.sql(sql).show(truncate=False)

+-------------------------+-----------------------------------+----------------------------------------------+-------+
|inp                      |inpType                            |out                                           |outType|
+-------------------------+-----------------------------------+----------------------------------------------+-------+
|{rahul, data engineering}|struct<name:string,interest:string>|{"name":"rahul","interest":"data engineering"}|string |
+-------------------------+-----------------------------------+----------------------------------------------+-------+



In [None]:
# to_json on an array of named_structs: the result is a JSON array with one object per struct

sql = """
with cte as
(
select array(named_struct('name','rahul','interest','data engineering'),
             named_struct('name','meghana','interest','data analysis')) as inp
)
select
  inp,
  typeOf(inp) as inpType,
  to_json(inp) as out,
  typeOf(to_json(inp)) as outType
from cte
"""
spark.sql(sql).show(truncate=False)

+-----------------------------------------------------+------------------------------------------+----------------------------------------------------------------------------------------------+-------+
|inp                                                  |inpType                                   |out                                                                                           |outType|
+-----------------------------------------------------+------------------------------------------+----------------------------------------------------------------------------------------------+-------+
|[{rahul, data engineering}, {meghana, data analysis}]|array<struct<name:string,interest:string>>|[{"name":"rahul","interest":"data engineering"},{"name":"meghana","interest":"data analysis"}]|string |
+-----------------------------------------------------+------------------------------------------+----------------------------------------------------------------------------------------------

In [None]:
# to_binary(expr[, format]): no format is passed here (it is optional).
# The result [01 67] shows the digits of 167 being read as hexadecimal, to_binary's default format.

sql = "SELECT to_binary(167) AS castedOut"
spark.sql(sql).show(truncate=False)

+---------+
|castedOut|
+---------+
|[01 67]  |
+---------+

