<a href="https://colab.research.google.com/github/rahulrajpr/prepare-anytime/blob/main/spark/functions/1_spark_sql_string_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spark String Functions
https://spark.apache.org/docs/latest/sql-ref-functions-builtin.html#string-functions

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that every query in this notebook runs against.
spark = (
    SparkSession.builder
    .appName('sql_functions')
    .getOrCreate()
)

In [None]:
# Sanity check: run a trivial query to confirm the session can execute SQL.
sql = "select 1 as col"
spark.sql(sql).show(truncate=False)

+---+
|col|
+---+
|1  |
+---+



In [None]:
## btrim : strip leading and trailing spaces from a string (same result as trim)
sql = "SELECT BTRIM(' Rahul') AS trimmedString"
spark.sql(sql).show(truncate=False)

+-------------+
|trimmedString|
+-------------+
|Rahul        |
+-------------+



In [None]:
## trim : strip leading and trailing spaces from a string (same result as btrim)
sql = "SELECT trim(' Rahul') AS trimmedString"
spark.sql(sql).show(truncate=False)

+-------------+
|trimmedString|
+-------------+
|Rahul        |
+-------------+



In [None]:
## ltrim : strip spaces from the left (leading) side only
sql = "SELECT ltrim(' Rahul') AS trimmedString"
spark.sql(sql).show(truncate=False)

+-------------+
|trimmedString|
+-------------+
|Rahul        |
+-------------+



In [None]:
## rtrim : strip spaces from the right (trailing) side only
sql = "SELECT rtrim('Rahul ') AS trimmedString"
spark.sql(sql).show(truncate=False)

+-------------+
|trimmedString|
+-------------+
|Rahul        |
+-------------+



In [None]:
## char_length : number of characters in the string (len and length give the same result)
sql = "SELECT char_length('Rahul') AS lengthChar"
spark.sql(sql).show(truncate=False)

+----------+
|lengthChar|
+----------+
|5         |
+----------+



In [None]:
## len : number of characters in the string (synonym for char_length)
sql = "SELECT len('Rahul') AS lengthChar"
spark.sql(sql).show(truncate=False)

+----------+
|lengthChar|
+----------+
|5         |
+----------+



In [None]:
## length : number of characters in the string (synonym for len / char_length)
sql = "SELECT length('Rahul') AS lengthChar"
spark.sql(sql).show(truncate=False)

+----------+
|lengthChar|
+----------+
|5         |
+----------+



In [None]:
## concat_ws : concatenate values, placing the delimiter (first argument) between them
sql = "SELECT concat_ws('-','Rahul','Raj') AS concat_ws_output"
spark.sql(sql).show(truncate=False)

+----------------+
|concat_ws_output|
+----------------+
|Rahul-Raj       |
+----------------+



In [None]:
## contains : true when the second string occurs inside the first, false otherwise
sql = "SELECT contains('Rahul Raj','Raj') AS isContains"
spark.sql(sql).show(truncate=False)

+----------+
|isContains|
+----------+
|true      |
+----------+



In [None]:
## elt : return the n-th value (1-based index, first argument) from the values that follow
sql = "SELECT elt(2,'Rahul','Skylr','lathika') AS elt_output"
spark.sql(sql).show(truncate=False)

+----------+
|elt_output|
+----------+
|Skylr     |
+----------+



In [None]:
## startswith : true when the string begins with the given substring
sql = "SELECT startswith('Bangalore','Ba') AS isStartsWith"
spark.sql(sql).show(truncate=False)

+------------+
|isStartsWith|
+------------+
|true        |
+------------+



In [None]:
## endswith : true when the string ends with the given substring
sql = "SELECT endswith('Bangalore','ore') AS isEndsWith"
spark.sql(sql).show(truncate=False)

+----------+
|isEndsWith|
+----------+
|true      |
+----------+



In [None]:
## find_in_set : 1-based position of a string within a comma-delimited list
## ('rahul' is the first entry here, hence 1); per the Spark docs, 0 means not found
sql = "SELECT find_in_set('rahul','rahul,lathika,skylr') AS inSet"
spark.sql(sql).show(truncate=False)

+-----+
|inSet|
+-----+
|1    |
+-----+



In [None]:
## format_number : render a number as text according to the format pattern (second argument)
## the result is a string, not a numeric value
for sql in (
    "SELECT format_number(123,'000') AS formatted_numeber",      # minimum three digits
    "SELECT format_number(123,'0000') AS formatted_numeber",     # zero-padded to four digits
    "SELECT format_number(123,'000.00') AS formatted_numeber",   # two decimal places
    "SELECT format_number(123,'$000.00') AS formatted_numeber",  # currency prefix
    "SELECT format_number(12345,'0,000') AS formatted_numeber",  # thousands separator
    "SELECT format_number(.84,'0.00%') AS formatted_numeber",    # percentage
):
    spark.sql(sql).show(truncate=False)


+-----------------+
|formatted_numeber|
+-----------------+
|123              |
+-----------------+

+-----------------+
|formatted_numeber|
+-----------------+
|0123             |
+-----------------+

+-----------------+
|formatted_numeber|
+-----------------+
|123.00           |
+-----------------+

+-----------------+
|formatted_numeber|
+-----------------+
|$123.00          |
+-----------------+

+-----------------+
|formatted_numeber|
+-----------------+
|12,345           |
+-----------------+

+-----------------+
|formatted_numeber|
+-----------------+
|84.00%           |
+-----------------+



In [None]:
## to_number : parse a string into a number according to the format pattern
## (the inverse direction of to_char); the result is a numeric (decimal) datatype.
## The input is passed as a string literal, matching the documented
## to_number(expr, fmt) signature, instead of relying on an implicit int -> string cast.
for sql in (
    "SELECT to_number('123','999') AS to_numbetOut",
    "SELECT to_number('123','9999') AS to_numbetOut",
    "SELECT to_number('123','999.99') AS to_numbetOut",
    "SELECT to_number('123','999.999') AS to_numbetOut",
):
    spark.sql(sql).show(truncate=False)

# Confirm the datatype of the parsed value.
sql = """
WITH CTE AS
(
SELECT to_number('123','999') AS to_numbetOut
)
SELECT TypeOf(to_numbetOut) as typeOut
FROM CTE
"""
spark.sql(sql).show(truncate=False)


+------------+
|to_numbetOut|
+------------+
|123         |
+------------+

+------------+
|to_numbetOut|
+------------+
|123         |
+------------+

+------------+
|to_numbetOut|
+------------+
|123.00      |
+------------+

+------------+
|to_numbetOut|
+------------+
|123.000     |
+------------+

+------------+
|typeOut     |
+------------+
|decimal(3,0)|
+------------+



In [None]:
## format_string : printf-style formatting, comparable to %-formatting / f-strings in Python

# %s - string placeholder
# %d - integer placeholder
# %f - floating-point placeholder (%.2f keeps two decimals)
for sql in (
    "SELECT format_string('I %s live in %s and i work at %s','rahul','bengaluru','examroom.ai') AS formatted_numeber",
    "SELECT format_string('Sanju Scored %d runs today',145) AS formatted_numeber",
    "SELECT format_string('My wieght is %.2f kg',CAST(69.5 AS DOUBLE)) AS formatted_numeber",
):
    spark.sql(sql).show(truncate=False)

+---------------------------------------------------+
|formatted_numeber                                  |
+---------------------------------------------------+
|I rahul live in bengaluru and i work at examroom.ai|
+---------------------------------------------------+

+---------------------------+
|formatted_numeber          |
+---------------------------+
|Sanju Scored 145 runs today|
+---------------------------+

+---------------------+
|formatted_numeber    |
+---------------------+
|My wieght is 69.50 kg|
+---------------------+



In [None]:
## to_char : convert a number to a string laid out according to the format pattern
## to_varchar : synonym for to_char (the outputs below are identical)
for sql in (
    "select to_char(123,'9999.99') as to_charOut",
    "select to_varchar(123,'9999.99') as to_charOut",
    "select to_varchar(123,'999') as to_charOut",
):
    spark.sql(sql).show(truncate=False)

+----------+
|to_charOut|
+----------+
| 123.00   |
+----------+

+----------+
|to_charOut|
+----------+
| 123.00   |
+----------+

+----------+
|to_charOut|
+----------+
|123       |
+----------+



In [None]:
## date_format : render a date (or a date string) as text in the given datetime pattern
for sql in (
    "select date_format(to_date('2025-10-25'), 'dd-MMM-yyyy') as to_charOut",  # explicit date value
    "select date_format('2025-10-25', 'dd-MMM-yyyy') as to_charOut",           # plain string input
):
    spark.sql(sql).show(truncate=False)

# The formatted value is a string, not a date.
sql = """
with cte as
(
select date_format('2025-10-25', 'dd-MMM-yyyy') as dateFormatOut
)
select typeOf(dateFormatOut) as outputType
from cte
"""
spark.sql(sql).show(truncate=False)

+-----------+
|to_charOut |
+-----------+
|25-Oct-2025|
+-----------+

+-----------+
|to_charOut |
+-----------+
|25-Oct-2025|
+-----------+

+----------+
|outputType|
+----------+
|string    |
+----------+



In [None]:
## initcap : capitalise the first letter of each word in the string
sql = "SELECT INITCAP('rahul') as initcapOut"
spark.sql(sql).show(truncate=False)

+----------+
|initcapOut|
+----------+
|Rahul     |
+----------+



In [None]:
## lcase : convert the whole string to lowercase (synonym for lower)
sql = "SELECT lcase('RaHul') as lcaseOut"
spark.sql(sql).show(truncate=False)

+--------+
|lcaseOut|
+--------+
|rahul   |
+--------+



In [None]:
## lower : convert the whole string to lowercase (synonym for lcase)
sql = "SELECT lower('RaHul') as lowerOut"
spark.sql(sql).show(truncate=False)

+--------+
|lowerOut|
+--------+
|rahul   |
+--------+



In [None]:
## ucase : convert the whole string to uppercase (synonym for upper)
sql = "SELECT ucase('RaHul') as uCaseOut"
spark.sql(sql).show(truncate=False)

+--------+
|uCaseOut|
+--------+
|RAHUL   |
+--------+



In [None]:
## upper : convert the whole string to uppercase (synonym for ucase)
sql = "SELECT upper('RaHul') as upperOut"
spark.sql(sql).show(truncate=False)

+--------+
|upperOut|
+--------+
|RAHUL   |
+--------+



In [None]:
## locate : 1-based position of a substring inside a string; syntax: locate(substring, string)
sql = "SELECT locate('h','Rahul') AS locationOut"
spark.sql(sql).show(truncate=False)

+-----------+
|locationOut|
+-----------+
|3          |
+-----------+



In [None]:
## position : 1-based position of a substring inside a string; syntax: position(substring, string)
## (same result as locate)
sql = "SELECT position('h','Rahul') AS locationOut"
spark.sql(sql).show(truncate=False)

+-----------+
|locationOut|
+-----------+
|3          |
+-----------+



In [None]:
## lpad : pad the string on the left with the pad character until it reaches the target length
sql = "SELECT lpad('Rahul',10,'0') AS padOut"
spark.sql(sql).show(truncate=False)

+----------+
|padOut    |
+----------+
|00000Rahul|
+----------+



In [None]:
## rpad : pad the string on the right with the pad character until it reaches the target length
sql = "SELECT rpad('Rahul',10,'0') AS padOut"
spark.sql(sql).show(truncate=False)

+----------+
|padOut    |
+----------+
|Rahul00000|
+----------+



In [None]:
## mask : hide the characters of a value by replacing them with mask characters
for sql in (
    # default mask characters: uppercase -> 'X', lowercase -> 'x'
    "SELECT mask('Rahul') AS maskOut",
    # custom mask characters: uppercase -> 'A', lowercase -> 'a';
    # per the Spark docs the third one ('o') applies to digits, absent from this input
    "SELECT mask('Rahul','A','a','o') AS maskOut",
):
    spark.sql(sql).show(truncate=False)

+-------+
|maskOut|
+-------+
|Xxxxx  |
+-------+

+-------+
|maskOut|
+-------+
|Aaaaa  |
+-------+



In [None]:
## replace : substitute every occurrence of a substring with another string
sql = "SELECT replace('rahul raj','raj','spark') as replacedString"
spark.sql(sql).show(truncate=False)

+--------------+
|replacedString|
+--------------+
|rahul spark   |
+--------------+



In [None]:
## overlay : replace `length` characters starting at the 1-based `position` with another string
## (here the single space at position 6 is replaced by ' spark ')
sql = "SELECT overlay('rahul raj',' spark ', 6,1 ) as replacedString"
spark.sql(sql).show(truncate=False)

+---------------+
|replacedString |
+---------------+
|rahul spark raj|
+---------------+



In [None]:
## regexp_replace : replace every substring that matches a regular expression
sql = "SELECT regexp_replace('123rahul567raj100yes89spark','[0-9]+','#') as regexReplaceOut"
spark.sql(sql).show(truncate=False)

+--------------------+
|regexReplaceOut     |
+--------------------+
|#rahul#raj#yes#spark|
+--------------------+



In [None]:
## translate : replace character by character, mapping each char of `from` to the char
## at the same position in `to`
for sql in (
    "select translate('rahul-aeiou','aou','###') as translateOut",
    "select translate('2025-10-25', '025', 'abc') as translateOut",
):
    spark.sql(sql).show(truncate=False)

+------------+
|translateOut|
+------------+
|r#h#l-#ei## |
+------------+

+------------+
|translateOut|
+------------+
|babc-1a-bc  |
+------------+



In [None]:
## regexp_count : count how many times a regular expression matches within a string
for sql in (
    "SELECT regexp_count('rahul raj spark yes spark','spark') as regexCountOut",
    # (?i) switches the pattern to case-insensitive matching
    "SELECT regexp_count('rahul raj spark yes spark','(?i)spark') as regexCountOut",
):
    spark.sql(sql).show(truncate=False)

+-------------+
|regexCountOut|
+-------------+
|2            |
+-------------+

+-------------+
|regexCountOut|
+-------------+
|2            |
+-------------+



In [None]:
## regexp_extract : extract the first substring that matches a regular expression
## the last argument is the group index; 0 stands for the entire match
for sql in (
    "SELECT regexp_extract('rahul raj Spark yes spark','spark',0) as regexCountOut",
    # (?i) switches the pattern to case-insensitive matching
    "SELECT regexp_extract('rahul raj Spark yes Spark','(?i)spark',0) as regexCountOut",
):
    spark.sql(sql).show(truncate=False)

+-------------+
|regexCountOut|
+-------------+
|spark        |
+-------------+

+-------------+
|regexCountOut|
+-------------+
|Spark        |
+-------------+



In [None]:
## regexp_extract_all : extract every substring that matches a regular expression
## 0 - stands for the entire match
for sql in (
    "SELECT regexp_extract_all('123rahul567raj100yes89spark','[0-9]',0) as regex_extract_allValue",   # one digit per match
    "SELECT regexp_extract_all('123rahul567raj100yes89spark','[0-9]+',0) as regex_extract_allValue",  # runs of digits
):
    spark.sql(sql).show(truncate=False)

# The result is an array of strings.
sql = """
WITH CTE AS
(
SELECT regexp_extract_all('123rahul567raj100yes89spark','[0-9]+',0) as regex_extract_allValue
)
SELECT typeOf(regex_extract_allValue) as outputType
FROM CTE
"""
spark.sql(sql).show(truncate=False)

+---------------------------------+
|regex_extract_allValue           |
+---------------------------------+
|[1, 2, 3, 5, 6, 7, 1, 0, 0, 8, 9]|
+---------------------------------+

+----------------------+
|regex_extract_allValue|
+----------------------+
|[123, 567, 100, 89]   |
+----------------------+

+-------------+
|outputType   |
+-------------+
|array<string>|
+-------------+



In [None]:
## repeat : repeat a string value n times
sql = "SELECT repeat('rahul',5) as repeatStringNTimes"
spark.sql(sql).show(truncate=False)

+-------------------------+
|repeatStringNTimes       |
+-------------------------+
|rahulrahulrahulrahulrahul|
+-------------------------+



In [None]:
## left : take the given number of characters from the left side of the string
sql = "SELECT left('rahul',2) as leftOut"
spark.sql(sql).show(truncate=False)

+-------+
|leftOut|
+-------+
|ra     |
+-------+



In [None]:
## right : take the given number of characters from the right side of the string
sql = "SELECT right('rahul',2) as leftOut"
spark.sql(sql).show(truncate=False)

+-------+
|leftOut|
+-------+
|ul     |
+-------+



In [None]:
## sentences : split a paragraph into an array of sentences, each one an array of words
for sql in (
    "SELECT sentences('Hello world. Spark SQL is amazing! Let us learn.') as sentancesOut",
    "SELECT sentences('Hello world') as sentancesOut",
):
    spark.sql(sql).show(truncate=False)

# The result is a nested array: array<array<string>>.
sql = """
WITH CTE AS
(
SELECT sentences('Hello world') as sentancesOut
)
SELECT TypeOf(sentancesOut) AS outType
FROM CTE
"""
spark.sql(sql).show(truncate=False)

+-------------------------------------------------------------+
|sentancesOut                                                 |
+-------------------------------------------------------------+
|[[Hello, world], [Spark, SQL, is, amazing], [Let, us, learn]]|
+-------------------------------------------------------------+

+----------------+
|sentancesOut    |
+----------------+
|[[Hello, world]]|
+----------------+

+--------------------+
|outType             |
+--------------------+
|array<array<string>>|
+--------------------+



In [None]:
## split : split the text around a delimiter or a regex pattern
for sql in (
    "SELECT split('Hello world. Spark SQL is amazing! Let us learn.',' ') as splitOut",      # plain delimiter
    "SELECT split('Hello world 1 Spark SQL is amazing 2 Let us learn.','[0-9]') as splitOut",  # regex pattern
):
    spark.sql(sql).show(truncate=False)

# The result is an array of strings.
sql = """
WITH CTE AS
(
SELECT split('Hello world 1 Spark SQL is amazing 2 Let us learn.','[0-9]') as splitOut
)
SELECT TypeOf(splitOut) AS outType
FROM CTE
"""
spark.sql(sql).show(truncate=False)

+----------------------------------------------------------+
|splitOut                                                  |
+----------------------------------------------------------+
|[Hello, world., Spark, SQL, is, amazing!, Let, us, learn.]|
+----------------------------------------------------------+

+------------------------------------------------------+
|splitOut                                              |
+------------------------------------------------------+
|[Hello world ,  Spark SQL is amazing ,  Let us learn.]|
+------------------------------------------------------+

+-------------+
|outType      |
+-------------+
|array<string>|
+-------------+



In [None]:
## split_part : split the text on a delimiter and return the part at the given 1-based index
for sql in (
    "SELECT split_part('Rahul Raj PR',' ',2) as splitPartOut",
    # a negative index counts from the end: -1 is the last part
    "SELECT split_part('Rahul Raj PR',' ',-1) as splitPartOut",
):
    spark.sql(sql).show(truncate=False)

+------------+
|splitPartOut|
+------------+
|Raj         |
+------------+

+------------+
|splitPartOut|
+------------+
|PR          |
+------------+



In [None]:
## substr : extract a substring starting at a 1-based position for a given length
## substring is a synonym; both accept the comma form and the FROM ... FOR ... form
for sql in (
    "SELECT substr('Rahul Raj',2,3) as substrout",
    "SELECT substr('Rahul Raj' FROM 2 FOR 3) as substrout",
    "SELECT substring('Rahul Raj',2,3) as substrout",
    "SELECT substring('Rahul Raj' FROM 2 FOR 3) as substrout",
):
    spark.sql(sql).show(truncate=False)

+---------+
|substrout|
+---------+
|ahu      |
+---------+

+---------+
|substrout|
+---------+
|ahu      |
+---------+

+---------+
|substrout|
+---------+
|ahu      |
+---------+

+---------+
|substrout|
+---------+
|ahu      |
+---------+



In [None]:
## substring_index : cut a string at the n-th occurrence of a delimiter
## - positive count: keep everything to the left of the n-th delimiter (counting from the start)
## - negative count: keep everything to the right of the n-th delimiter (counting from the end)
for sql in (
    "SELECT substring_index('rahul-raj-pr','-',1) as substing_indexOut",
    "SELECT substring_index('rahul-raj-pr','-',2) as substing_indexOut",
    "SELECT substring_index('rahul-raj-pr','-',-1) as substing_indexOut",
    "SELECT substring_index('rahul-raj-pr','-',-2) as substing_indexOut",
):
    spark.sql(sql).show(truncate=False)

+-----------------+
|substing_indexOut|
+-----------------+
|rahul            |
+-----------------+

+-----------------+
|substing_indexOut|
+-----------------+
|rahul-raj        |
+-----------------+

+-----------------+
|substing_indexOut|
+-----------------+
|pr               |
+-----------------+

+-----------------+
|substing_indexOut|
+-----------------+
|raj-pr           |
+-----------------+

