In [20]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType, StringType, IntegerType

# Notebook-wide Spark entry point; the application is registered under the name 'sql'.
spark = (
    SparkSession.builder
    .appName('sql')
    .getOrCreate()
)

In [15]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Variant: derive the name length from a SQL expression string via expr().
(
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', expr('length(name)'))
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
    .show()
)

+----------+--------+---+-----------+
|identifier|    name|age|name_length|
+----------+--------+---+-----------+
|         6|Franklin| 33|          8|
|         4|   David| 11|          5|
|         3|Carolynn|  8|          8|
|         1|   Alice|  2|          5|
+----------+--------+---+-----------+


In [16]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Variant: pass the column *name* as a plain string to the length() function.
(
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length('name'))
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
    .show()
)

+----------+--------+---+-----------+
|identifier|    name|age|name_length|
+----------+--------+---+-----------+
|         6|Franklin| 33|          8|
|         4|   David| 11|          5|
|         3|Carolynn|  8|          8|
|         1|   Alice|  2|          5|
+----------+--------+---+-----------+


In [19]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Variant: hand length() an explicit Column object built with col().
(
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length(col('name')))
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
    .show()
)

+----------+--------+---+-----------+
|identifier|    name|age|name_length|
+----------+--------+---+-----------+
|         6|Franklin| 33|          8|
|         4|   David| 11|          5|
|         3|Carolynn|  8|          8|
|         1|   Alice|  2|          5|
+----------+--------+---+-----------+


In [14]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Variant: reference the column through attribute access on the source DataFrame.
(
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length(df.name))
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
    .show()
)

+----------+-----+---+-----------+
|identifier| name|age|name_length|
+----------+-----+---+-----------+
|         4|David| 11|          5|
|         3|Carol|  8|          5|
|         1|Alice|  2|          5|
+----------+-----+---+-----------+


In [8]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame([
    (1, "Alice", 2),
    (2, "Bob", 5),
    (3, "Carolynn", 8),  # Longer name
    (4, "David", 11),
    (5, "Eve", 22),  # Shorter name
    (6, "Franklin", 33)  # Longer name
], ['id', 'name', 'age'])

# Schema of the raw input, before any derived columns are added.
df.printSchema()

# Derived columns are written as SQL expressions / built-in functions.
# Reference for the built-in SQL functions:
# https://spark.apache.org/docs/latest/api/sql/index.html
#
# SUBSTR positions are 1-based in Spark SQL, so the first character is
# SUBSTR(name, 1, 1). Position 0 happens to give the same result but reads as
# an off-by-one, so the 1-based form is used here.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length('name'))
    .withColumn('starts_with_vowel', expr("LOWER(SUBSTR(name, 1, 1)) in ('a', 'e', 'i', 'o', 'u')"))
)

# Schema after the rename and the two derived columns.
extended_df.printSchema()

extended_df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
+----------+--------+---+-----------+-----------------+
|identifier|    name|age|name_length|starts_with_vowel|
+----------+--------+---+-----------+-----------------+
|         1|   Alice|  2|          5|             true|
|         2|     Bob|  5|          3|            false|
|         3|Carolynn|  8|          8|            false|
|         4|   David| 11|          5|            false|
|         5|     Eve| 22|          3|             true|
|         6|Franklin| 33|          8|            false|
+----------+--------+---+-----------+-----------------+

root
 |-- identifier: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name_length: integer (nullable = true)
 |-- starts_with_vowel: boolean (nullable = true)


In [11]:
# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame([
    (1, "Alice", 2),
    (2, "Bob", 5),
    (3, "Carolynn", 8),  # Longer name
    (4, "David", 11),
    (5, "Eve", 22),  # Shorter name
    (6, "Franklin", 33)  # Longer name
], ['id', 'name', 'age'])

# All derived columns use built-in column functions (no Python UDFs).
# substring() takes a 1-based start position, so the first character is
# substring('name', 1, 1); position 0 gives the same result but is misleading.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length('name'))
    .withColumn('starts_with_vowel', lower(substring('name', 1, 1)).isin(['a', 'e', 'i', 'o', 'u']))
    .withColumn(
        'age_group',
        when(col('age') < 13, 'child')    # under 13
        .when(col('age') < 20, 'teen')    # 13 to 19
        .otherwise('adult'),              # everything else
    )
)

extended_df.show()

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         1|   Alice|  2|          5|             true|    child|
|         2|     Bob|  5|          3|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         4|   David| 11|          5|            false|    child|
|         5|     Eve| 22|          3|             true|    adult|
|         6|Franklin| 33|          8|            false|    adult|
+----------+--------+---+-----------+-----------------+---------+


In [13]:
def starts_with_vowel(name):
    """Report whether ``name`` begins with a vowel, ignoring case.

    Args:
        name: The string to inspect, or ``None``.

    Returns:
        ``True`` if the first character is one of a/e/i/o/u (any case),
        ``False`` otherwise (including for the empty string), and ``None``
        when ``name`` is ``None`` so that a missing value stays missing
        instead of raising.
    """
    if name is None:
        return None
    # Slicing never raises on an empty string, unlike name[0].
    return name[:1].lower() in ('a', 'e', 'i', 'o', 'u')


# Expose the plain Python predicate to Spark as a UDF returning a boolean column.
starts_with_vowel_udf = udf(starts_with_vowel, returnType=BooleanType())

# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Rename, derive three columns (one of them through the UDF), then filter and sort.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length('name'))
    .withColumn('starts_with_vowel', starts_with_vowel_udf('name'))
    .withColumn(
        'age_group',
        when(df['age'] < 13, 'child')
        .when(df['age'] < 20, 'teen')
        .otherwise('adult'),
    )
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
)

extended_df.show()

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


In [3]:
# Lambda-based UDF. Spark passes SQL NULL to Python as None, so guard against it;
# name[:1] (rather than name[0]) also keeps an empty string from raising.
# NULL in -> NULL out, '' -> False.
starts_with_vowel_udf = udf(
    lambda name: None if name is None else name[:1].lower() in ('a', 'e', 'i', 'o', 'u'),
    BooleanType(),
)

# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame([
    (1, "Alice", 2),
    (2, "Bob", 5),
    (3, "Carolynn", 8),  # Longer name
    (4, "David", 11),
    (5, "Eve", 22),  # Shorter name
    (6, "Franklin", 33)  # Longer name
], ['id', 'name', 'age'])

# Rename, derive three columns, then filter and sort.
# The age column is referenced via col('age') in both branches for consistency.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', length('name'))
    .withColumn('starts_with_vowel', starts_with_vowel_udf('name'))
    .withColumn(
        'age_group',
        when(col('age') < 13, 'child')
        .when(col('age') < 20, 'teen')
        .otherwise('adult'),
    )
    .filter(col('name_length') > 3)
    .orderBy('name', ascending=False)
)

extended_df.show()

                                                                                

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


In [5]:
# Lambda-based UDFs. Spark passes SQL NULL to Python as None, so each lambda
# returns None for a None input instead of raising inside the Python worker.

# NULL -> NULL, '' -> False (name[:1] never raises on an empty string).
starts_with_vowel_udf = udf(
    lambda name: None if name is None else name[:1].lower() in ('a', 'e', 'i', 'o', 'u'),
    BooleanType(),
)

# NULL -> NULL; otherwise <13 'child', <20 'teen', else 'adult'.
age_group_udf = udf(
    lambda age: None if age is None else ('child' if age < 13 else 'teen' if age < 20 else 'adult'),
    StringType(),
)

# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame([
    (1, "Alice", 2),
    (2, "Bob", 5),
    (3, "Carolynn", 8),  # Longer name
    (4, "David", 11),
    (5, "Eve", 22),  # Shorter name
    (6, "Franklin", 33)  # Longer name
], ['id', 'name', 'age'])

# Rename, derive three columns (two through UDFs), then filter and sort.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', expr("length(name)"))
    .withColumn('starts_with_vowel', starts_with_vowel_udf('name'))
    .withColumn('age_group', age_group_udf('age'))
    .filter(col('name_length') > 3)
    .orderBy('name', ascending=False)
)

extended_df.show()

[Stage 2:>                                                        (0 + 10) / 10]

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


                                                                                

In [7]:
# Lambda-based UDFs. Spark passes SQL NULL to Python as None, so each lambda
# returns None for a None input instead of raising inside the Python worker.

# NULL -> NULL, '' -> False (name[:1] never raises on an empty string).
starts_with_vowel_udf = udf(
    lambda name: None if name is None else name[:1].lower() in ('a', 'e', 'i', 'o', 'u'),
    BooleanType(),
)

# NULL -> NULL; otherwise <13 'child', <20 'teen', else 'adult'.
age_group_udf = udf(
    lambda age: None if age is None else ('child' if age < 13 else 'teen' if age < 20 else 'adult'),
    StringType(),
)

# NULL -> NULL; otherwise the number of characters (0 for an empty string).
string_length_udf = udf(lambda name: None if name is None else len(name), IntegerType())

# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame([
    (1, "Alice", 2),
    (2, "Bob", 5),
    (3, "Carolynn", 8),  # Longer name
    (4, "David", 11),
    (5, "Eve", 22),  # Shorter name
    (6, "Franklin", 33)  # Longer name
], ['id', 'name', 'age'])

# Rename, derive three columns (all through UDFs), then filter and sort.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', string_length_udf('name'))
    .withColumn('starts_with_vowel', starts_with_vowel_udf('name'))
    .withColumn('age_group', age_group_udf('age'))
    .filter(col('name_length') > 3)
    .orderBy('name', ascending=False)
)

extended_df.show()

[Stage 3:>                                                        (0 + 10) / 10]

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


                                                                                

In [12]:
def starts_with_vowel(name):
    """Report whether ``name`` begins with a vowel, ignoring case.

    Args:
        name: The string to inspect, or ``None``.

    Returns:
        ``True`` if the first character is one of a/e/i/o/u (any case),
        ``False`` otherwise (including for the empty string), and ``None``
        when ``name`` is ``None`` so that a missing value stays missing
        instead of raising.
    """
    if name is None:
        return None
    # Slicing never raises on an empty string, unlike name[0].
    return name[:1].lower() in ('a', 'e', 'i', 'o', 'u')


def age_group(age):
    """Bucket an age into 'child', 'teen' or 'adult'.

    Args:
        age: A number, or ``None``.

    Returns:
        'child' for ages below 13, 'teen' for ages below 20, 'adult'
        otherwise. ``None`` is returned for a ``None`` age, because comparing
        ``None`` with a number would raise ``TypeError``.
    """
    if age is None:
        return None
    if age < 13:
        return 'child'
    if age < 20:
        return 'teen'
    return 'adult'


def string_length(name):
    """Return the number of characters in ``name``.

    Args:
        name: The string to measure, or ``None``.

    Returns:
        ``len(name)``, which is 0 for an empty string, or ``None`` when
        ``name`` is ``None``. The check is an explicit ``is None`` test: a
        truthiness test would wrongly report an empty string as missing.
    """
    return None if name is None else len(name)


# Wrap each plain Python function as a Spark UDF, declaring its SQL return type.
starts_with_vowel_udf = udf(starts_with_vowel, returnType=BooleanType())
age_group_udf = udf(age_group, returnType=StringType())
string_length_udf = udf(string_length, returnType=IntegerType())

# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Rename, derive three columns (all through UDFs), then filter and sort.
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .withColumn('name_length', string_length_udf('name'))
    .withColumn('starts_with_vowel', starts_with_vowel_udf('name'))
    .withColumn('age_group', age_group_udf('age'))
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
)

extended_df.show()

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


                                                                                

In [14]:
def with_starts_with_vowel(df_transform: DataFrame) -> DataFrame:
    """Add a boolean 'starts_with_vowel' column derived from the 'name' column.

    Args:
        df_transform: DataFrame that has a 'name' column.

    Returns:
        A new DataFrame with 'starts_with_vowel' set to whether the lower-cased
        first character of 'name' is one of a/e/i/o/u.
    """
    # substring() positions are 1-based, so the first character is (1, 1).
    first_letter = lower(substring(col('name'), 1, 1))
    return df_transform.withColumn('starts_with_vowel',
                                   first_letter.isin(['a', 'e', 'i', 'o', 'u']))


def with_age_group(df_transform: DataFrame) -> DataFrame:
    """Add an 'age_group' column derived from the 'age' column.

    The value is 'child' when age < 13, 'teen' when age < 20 and 'adult'
    in every other case.
    """
    age = col('age')
    bucket = (
        when(age < 13, 'child')
        .when(age < 20, 'teen')
        .otherwise('adult')
    )
    return df_transform.withColumn('age_group', bucket)


def with_name_length(df_transform: DataFrame) -> DataFrame:
    """Add a 'name_length' column holding the length of the 'name' column."""
    name_size = length(col('name'))
    return df_transform.withColumn('name_length', name_size)


# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Chain the reusable column-adding functions with DataFrame.transform().
extended_df = (
    df
    .withColumnRenamed('id', 'identifier')
    .transform(with_name_length)
    .transform(with_starts_with_vowel)
    .transform(with_age_group)
    .filter(col('name_length') > 3)    # keep names longer than three characters
    .orderBy('name', ascending=False)  # reverse alphabetical order
)

extended_df.show()

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+


In [25]:
def with_starts_with_vowel(df_transform: DataFrame) -> DataFrame:
    """Add a boolean 'starts_with_vowel' column derived from the 'name' column.

    Args:
        df_transform: DataFrame that has a 'name' column.

    Returns:
        A new DataFrame with 'starts_with_vowel' set to whether the lower-cased
        first character of 'name' is one of a/e/i/o/u.
    """
    # substring() positions are 1-based, so the first character is (1, 1).
    first_letter = lower(substring(col('name'), 1, 1))
    return df_transform.withColumn('starts_with_vowel',
                                   first_letter.isin(['a', 'e', 'i', 'o', 'u']))


def with_age_group(df_transform: DataFrame) -> DataFrame:
    """Add an 'age_group' column derived from the 'age' column.

    The value is 'child' when age < 13, 'teen' when age < 20 and 'adult'
    in every other case.
    """
    age = col('age')
    bucket = (
        when(age < 13, 'child')
        .when(age < 20, 'teen')
        .otherwise('adult')
    )
    return df_transform.withColumn('age_group', bucket)


def with_name_length(df_transform: DataFrame) -> DataFrame:
    """Add a 'name_length' column holding the length of the 'name' column."""
    name_size = length(col('name'))
    return df_transform.withColumn('name_length', name_size)


def rename_columns(df_transform: DataFrame, new_column_names: dict) -> DataFrame:
    """Rename several columns in one call.

    Args:
        df_transform: The DataFrame whose columns are renamed.
        new_column_names: Mapping of existing column name -> new column name.

    Returns:
        The result of applying ``withColumnRenamed`` once per mapping entry,
        in the mapping's iteration order; the input itself when the mapping
        is empty.
    """
    renamed = df_transform
    for current, replacement in new_column_names.items():
        renamed = renamed.withColumnRenamed(current, replacement)
    return renamed


def filter_by_name_length(df_transform: DataFrame, min_length: int) -> DataFrame:
    """Keep only rows whose 'name_length' is strictly greater than ``min_length``."""
    long_enough = col('name_length') > min_length
    return df_transform.filter(long_enough)


def sort_by_name(df_transform: DataFrame, ascending: bool = True) -> DataFrame:
    """Order the rows by the 'name' column.

    Args:
        df_transform: The DataFrame to sort.
        ascending: Passed straight through to ``orderBy``; defaults to True.
    """
    ordered = df_transform.orderBy('name', ascending=ascending)
    return ordered


def transform(df_transform: DataFrame, *transform_functions) -> DataFrame:
    """Pipe a value through a sequence of single-argument functions.

    Args:
        df_transform: The starting value (a DataFrame in this notebook).
        *transform_functions: Callables applied left to right; each one
            receives the previous callable's result.

    Returns:
        The output of the last callable, or ``df_transform`` unchanged when
        no callables are given.
    """
    # NOTE(review): the notebook star-imports pyspark.sql.functions, which also
    # exports a `transform`; this definition rebinds that name - confirm intended.
    current = df_transform
    for step in transform_functions:
        current = step(current)
    return current


# Sample rows of (id, name, age); the names deliberately differ in length.
df = spark.createDataFrame(
    data=[
        (1, "Alice", 2),
        (2, "Bob", 5),
        (3, "Carolynn", 8),   # long name
        (4, "David", 11),
        (5, "Eve", 22),       # short name
        (6, "Franklin", 33),  # long name
    ],
    schema=['id', 'name', 'age'],
)

# Run the whole pipeline through the custom transform() helper. Steps that need
# extra arguments are wrapped in lambdas so each step takes only the DataFrame.
extended_df = transform(
    df,
    with_name_length,
    with_starts_with_vowel,
    with_age_group,
    lambda frame: filter_by_name_length(frame, 3),
    lambda frame: rename_columns(frame, {'id': 'identifier'}),
    lambda frame: sort_by_name(frame, ascending=False),
)

extended_df.show()

+----------+--------+---+-----------+-----------------+---------+
|identifier|    name|age|name_length|starts_with_vowel|age_group|
+----------+--------+---+-----------+-----------------+---------+
|         6|Franklin| 33|          8|            false|    adult|
|         4|   David| 11|          5|            false|    child|
|         3|Carolynn|  8|          8|            false|    child|
|         1|   Alice|  2|          5|             true|    child|
+----------+--------+---+-----------+-----------------+---------+
