# String Fun

📘 PySpark String Functions – Table Format

In [None]:
| Function                                    | Type of Function | Description                                   | Example                                          |
| ------------------------------------------- | ---------------- | --------------------------------------------- | ------------------------------------------------ |
| `lower(col)`                                | String Case      | Converts all characters to lowercase          | `df.select(lower("text"))`                       |
| `upper(col)`                                | String Case      | Converts all characters to uppercase          | `df.select(upper("text"))`                       |
| `initcap(col)`                              | String Case      | Converts first letter of each word to capital | `df.select(initcap("text"))`                     |
| `length(col)`                               | Measurement      | Returns length of string                      | `df.select(length("text"))`                      |
| `trim(col)`                                 | Whitespace       | Removes leading + trailing spaces             | `df.select(trim("text"))`                        |
| `ltrim(col)`                                | Whitespace       | Removes leading spaces                        | `df.select(ltrim("text"))`                       |
| `rtrim(col)`                                | Whitespace       | Removes trailing spaces                       | `df.select(rtrim("text"))`                       |
| `concat(col1, col2)`                        | Combine          | Joins multiple columns                        | `df.select(concat("text", lit("!")))`            |
| `concat_ws(sep, cols)`                      | Combine          | Joins columns with a separator                | `df.select(concat_ws("-", "id", "text"))`        |
| `substring(col, start, len)`                | Extract          | Extracts part of string (1-based)             | `df.select(substring("text", 1, 4))`             |
| `col.substr(start, len)`                    | Extract          | Same as substring, as a Column method         | `df.select(col("text").substr(1, 4))`            |
| `split(col, regex)`                         | Split            | Splits string into array                      | `df.select(split("text", " "))`                  |
| `regexp_extract(col, pattern, idx)`         | Regex            | Extracts regex match                          | `df.select(regexp_extract("text", "(\\w+)", 0))` |
| `regexp_replace(col, pattern, replacement)` | Regex            | Replace using regex pattern                   | `df.select(regexp_replace("text", " ", "_"))`    |
| `replace(col, old, new)`                    | Replace          | Replace exact substring (non-regex)           | `df.select(expr("replace(text, 'a', 'X')"))`     |
| `translate(col, match, replace)`            | Replace          | Replace multiple characters in one call       | `df.select(translate("text", "ae", "12"))`       |
| `instr(col, substring)`                     | Search           | Returns index of substring (1-based)          | `df.select(instr("text", "a"))`                  |
| `locate(substr, col)`                       | Search           | Same as instr but params reversed             | `df.select(locate("a", "text"))`                 |
| `reverse(col)`                              | Transformation   | Reverses string                               | `df.select(reverse("text"))`                     |
| `repeat(col, n)`                            | Transformation   | Repeats string n times                        | `df.select(repeat("text", 3))`                   |


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session used by every example in this notebook.
spark = SparkSession.builder.getOrCreate()

# Sample data: a padded string, two mixed-case strings, and a None row so
# each example also shows what the function returns for a missing value.
df = spark.createDataFrame(
    data=[
        (1, "  hello World  "),
        (2, "Spark SQL Functions"),
        (3, "data ENGINEER"),
        (4, None),
    ],
    schema=["id", "text"],
)


# 🔤 1. lower()
# Lowercases every character of the string.
df.select(lower(col("text"))).show()

# 🔤 2. upper()
# Uppercases every character of the string.
df.select(upper(col("text"))).show()

# 🔤 3. length()
# Number of characters in the string.
df.select(length(col("text"))).show()

# 🔤 4. trim()
# Strips spaces from both ends.
df.select(trim(col("text"))).show()

# 🔤 5. ltrim()
# Strips leading (left-hand) spaces only.
df.select(ltrim(col("text"))).show()

# 🔤 6. rtrim()
# Strips trailing (right-hand) spaces only.
df.select(rtrim(col("text"))).show()

# 🔤 7. concat()
# Joins columns/literals end to end.
df.select(concat(col("text"), lit("!!!"))).show()

# 🔤 8. concat_ws()
# Joins columns with the given separator between them.
df.select(concat_ws(" - ", col("id"), col("text"))).show()

# 🔤 9. substring()
# Takes a slice of the string.
# (start index is 1-based)
df.select(substring(col("text"), 1, 5)).show()

# 🔤 10. substr() (Column-method form)
# Same slice as substring() above (1-based start, then length), written as
# a method on the column so it works regardless of PySpark version.
df.select(col("text").substr(1, 5)).show()

# 🔤 11. split()
# Splits by a delimiter pattern → returns an array column.
# truncate=False keeps the full arrays visible in the printed table.
df.select(split("text", " ")).show(truncate=False)

# 🔤 12. regexp_extract()
# Pulls out the part of the string matched by a regex (group 0 = whole match).
df.select(regexp_extract(col("text"), "(\\w+)", 0)).show()

# 🔤 13. regexp_replace()
# Substitutes every regex match with the replacement text.
df.select(regexp_replace(col("text"), " ", "_")).show()

# 🔤 14. replace()
# Plain (non-regex) substring replacement, invoked through a SQL expression.
df.select(expr("replace(text, ' ', '-')")).show()

# 🔤 15. translate()
# Character-by-character mapping: a→1, e→2, i→3, o→4, u→5.
df.select(translate(col("text"), "aeiou", "12345")).show()

# 🔤 16. instr()
# Find the 1-based position of a substring.
df.select(instr("text", "a")).show()

# 🔤 17. locate()
# Same search as instr(), but the substring is passed first and the
# column second.
df.select(locate("a", "text")).show()

# 🔤 18. initcap()
# Converts to Title Case (first letter of each word capitalized).
df.select(initcap("text")).show()

# 🔤 19. reverse()
# Returns the characters of the string in reverse order.
df.select(reverse(col("text"))).show()

# 🔤 20. format_string()
# printf-style formatting with column values as the arguments.
df.select(format_string("ID: %d, Text: %s", col("id"), col("text"))).show()

# 🔤 21. repeat()
# Concatenates the string with itself N times.
df.select(repeat(col("text"), 2)).show()

In [1]:
# Requires the setup cell (which defines `df`) to have been run in this
# kernel first; running this cell on its own raises NameError.
# select() alone only builds a lazy DataFrame, so call show() to evaluate it
# and print the timestamp; truncate=False keeps the full value visible.
df.select(current_timestamp()).show(truncate=False)

NameError: name 'df' is not defined