In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Column names, in the same order as the fields of each sample tuple below.
cols = ["id", "name", "email"]

# Sample rows: names in mixed casing, one email padded with spaces,
# and one row whose email is missing.
data = [
    (1, "alice john", " alice@gmail.com "),
    (2, "Bob Smith", "bob@yahoo.com"),
    (3, "CHARLIE", None),
]

df = spark.createDataFrame(data, schema=cols)

# truncate=False prints full cell contents so the padding stays visible.
df.show(truncate=False)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 09:30:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/16 09:30:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/16 09:30:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

+---+----------+-----------------+
|id |name      |email            |
+---+----------+-----------------+
|1  |alice john| alice@gmail.com |
|2  |Bob Smith |bob@yahoo.com    |
|3  |CHARLIE   |NULL             |
+---+----------+-----------------+



upper / lower

Definition: `upper` converts a string to uppercase; `lower` converts it to lowercase.

In [3]:
# Show the name column lower-cased and upper-cased side by side.
df.select(
    lower(df["name"]),
    upper(df["name"]),
).show()

+-----------+-----------+
|lower(name)|upper(name)|
+-----------+-----------+
| alice john| ALICE JOHN|
|  bob smith|  BOB SMITH|
|    charlie|    CHARLIE|
+-----------+-----------+



trim / ltrim / rtrim

Definition: Removes whitespace from both sides (`trim`), the left side (`ltrim`), or the right side (`rtrim`) of a string.

In [4]:
# Add a copy of email with leading and trailing whitespace stripped.
df.withColumn(
    "clean_email",
    trim(df["email"]),
).show()

+---+----------+-----------------+---------------+
| id|      name|            email|    clean_email|
+---+----------+-----------------+---------------+
|  1|alice john| alice@gmail.com |alice@gmail.com|
|  2| Bob Smith|    bob@yahoo.com|  bob@yahoo.com|
|  3|   CHARLIE|             NULL|           NULL|
+---+----------+-----------------+---------------+



length

Definition: Returns the number of characters in a string.


In [5]:
# Show each name next to its character count.
df.select(
    df["name"],
    length(df["name"]).alias("len"),
).show()

+----------+---+
|      name|len|
+----------+---+
|alice john| 10|
| Bob Smith|  9|
|   CHARLIE|  7|
+----------+---+



substring

Definition: Extracts part of a string given a start position (1-based) and a length.

In [6]:
# Take the first five characters of each name (start position 1, length 5).
df.select(
    substring("name", pos=1, len=5)
).show()

+---------------------+
|substring(name, 1, 5)|
+---------------------+
|                alice|
|                Bob S|
|                CHARL|
+---------------------+



concat / concat_ws

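Definition: Concatenates multiple strings. `concat` joins them directly and returns NULL if any input is NULL; `concat_ws` joins them with a separator and skips NULL inputs.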

In [9]:
# Join id and name into one string, separated by a single space.
df.select(
    concat_ws(" ", df["id"], df["name"])
).show()

+----------------------+
|concat_ws( , id, name)|
+----------------------+
|          1 alice john|
|           2 Bob Smith|
|             3 CHARLIE|
+----------------------+



like / rlike

Definition: Pattern matching (SQL LIKE / regex).

In [8]:
# SQL LIKE: % is a wildcard, so this keeps emails containing "gmail".
df.where(df["email"].like("%gmail%")).show()

# Regex match: keeps emails in which the pattern "yahoo" is found.
df.where(df["email"].rlike("yahoo")).show()

+---+----------+-----------------+
| id|      name|            email|
+---+----------+-----------------+
|  1|alice john| alice@gmail.com |
+---+----------+-----------------+

+---+---------+-------------+
| id|     name|        email|
+---+---------+-------------+
|  2|Bob Smith|bob@yahoo.com|
+---+---------+-------------+



split

Definition: Splits a string into an array around a delimiter; the delimiter is interpreted as a regular expression.

In [10]:
# Break each email into the parts before and after "@".
df.withColumn(
    "email_parts",
    split(df["email"], "@"),
).show()

+---+----------+-----------------+--------------------+
| id|      name|            email|         email_parts|
+---+----------+-----------------+--------------------+
|  1|alice john| alice@gmail.com |[ alice, gmail.com ]|
|  2| Bob Smith|    bob@yahoo.com|    [bob, yahoo.com]|
|  3|   CHARLIE|             NULL|                NULL|
+---+----------+-----------------+--------------------+



regexp_replace

Definition: Replaces every substring that matches a regular expression with a replacement string.

In [11]:
# Rewrite the gmail.com domain to company.com.
# The pattern is a regular expression, so the dot is escaped: an unescaped "."
# matches any character and would also rewrite text such as "gmailXcom".
df.withColumn(
    "email_clean",
    regexp_replace("email", r"gmail\.com", "company.com"),
).show()

+---+----------+-----------------+-------------------+
| id|      name|            email|        email_clean|
+---+----------+-----------------+-------------------+
|  1|alice john| alice@gmail.com | alice@company.com |
|  2| Bob Smith|    bob@yahoo.com|      bob@yahoo.com|
|  3|   CHARLIE|             NULL|               NULL|
+---+----------+-----------------+-------------------+



replace

Definition: Replaces every occurrence of a literal substring (the search text is not treated as a regex).

In [14]:
# Print the rows using show()'s default limits (20 rows, truncated cells).
df.show(n=20, truncate=True)

# Print the column names, types, and nullability.
df.printSchema()

+---+----------+-----------------+
| id|      name|            email|
+---+----------+-----------------+
|  1|alice john| alice@gmail.com |
|  2| Bob Smith|    bob@yahoo.com|
|  3|   CHARLIE|             NULL|
+---+----------+-----------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)



In [15]:


# Literal (non-regex) replacement of "alice" with "Alice" in the name column.
# replace() is available in pyspark.sql.functions from Spark 3.5 onward.
# Its search/replacement arguments accept columns, and a bare string would be
# read as a column name, so the literal texts are wrapped in lit().
df.withColumn(
    "name_fixed",
    replace(df["name"], lit("alice"), lit("Alice")),
).show()


+---+----------+-----------------+----------+
| id|      name|            email|name_fixed|
+---+----------+-----------------+----------+
|  1|alice john| alice@gmail.com |Alice john|
|  2| Bob Smith|    bob@yahoo.com| Bob Smith|
|  3|   CHARLIE|             NULL|   CHARLIE|
+---+----------+-----------------+----------+



isNull / isNotNull (String handling)

Definition: Tests whether a column value is NULL (`isNull`) or not NULL (`isNotNull`); typically used inside `filter`.

In [16]:
# Keep only the rows that have an email value.
df.where(df["email"].isNotNull()).show()

+---+----------+-----------------+
| id|      name|            email|
+---+----------+-----------------+
|  1|alice john| alice@gmail.com |
|  2| Bob Smith|    bob@yahoo.com|
+---+----------+-----------------+



In [17]:
# Keep only the rows whose email is missing.
df.where(df["email"].isNull()).show()

+---+-------+-----+
| id|   name|email|
+---+-------+-----+
|  3|CHARLIE| NULL|
+---+-------+-----+

