In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Obtain the Spark session for this notebook (an existing one is reused),
# registered under the application name 'sql'.
spark = (
    SparkSession.builder
    .appName('sql')
    .getOrCreate()
)

In [4]:
# Two-row demo frame; the select below shows the columns in (name, age) order.
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
df.select(df['name'], df['age']).show()

+-----+---+
| name|age|
+-----+---+
|Alice|  2|
|  Bob|  5|
+-----+---+


In [5]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
# The derived column is left un-aliased, so Spark names it after the expression.
df.select(df['name'], df['age'], df['age'] + 1).show()

+-----+---+---------+
| name|age|(age + 1)|
+-----+---+---------+
|Alice|  2|        3|
|  Bob|  5|        6|
+-----+---+---------+


In [6]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
# Same projection as before, but the derived column gets an explicit name.
df.select(
    df['name'],
    df['age'],
    (df['age'] + 1).alias('agePlusOne'),
).show()

+-----+---+----------+
| name|age|agePlusOne|
+-----+---+----------+
|Alice|  2|         3|
|  Bob|  5|         6|
+-----+---+----------+


In [7]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
# Project the three columns, then keep only rows whose age is above 3.
df.select(
    df['name'],
    df['age'],
    (df['age'] + 1).alias('agePlusOne'),
).filter(df['age'] > 3).show()

+----+---+----------+
|name|age|agePlusOne|
+----+---+----------+
| Bob|  5|         6|
+----+---+----------+


In [8]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
# collect() brings the filtered result back to the driver, so `df_filtered`
# is iterated below as plain Python rows rather than as a DataFrame.
df_filtered = (
    df.select(
        df['name'],
        df['age'],
        (df['age'] + 1).alias('agePlusOne'),
    )
    .filter(df['age'] > 3)
    .collect()
)

# Fields of each collected row are read by attribute.
for row in df_filtered:
    print(f'{row.name} is {row.age} years old, and will be {row.agePlusOne} years old next year')

Bob is 5 years old, and will be 6 years old next year


In [9]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
df_filtered = (
    df.select(
        df['name'],
        df['age'],
        (df['age'] + 1).alias('agePlusOne'),
    )
    .filter(df['age'] > 3)
    .collect()
)
# Lazily turn each collected row into its summary sentence.
rows = (
    f'{r.name} is {r.age} years old, and will be {r.agePlusOne} years old next year'
    for r in df_filtered
)

for row in rows:
    print(row)

Bob is 5 years old, and will be 6 years old next year


In [10]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
df_filtered = (
    df.select(
        df['name'],
        df['age'],
        (df['age'] + 1).alias('agePlusOne'),
    )
    .filter(df['age'] > 3)
    .collect()
)
# Convert each collected row to a plain dict so fields can be read by key.
rows_dict = (r.asDict() for r in df_filtered)

for row in rows_dict:
    print(row)
    print(f'{row["name"]} is {row["age"]} years old, and will be {row["agePlusOne"]} years old next year')

{'name': 'Bob', 'age': 5, 'agePlusOne': 6}
Bob is 5 years old, and will be 6 years old next year


In [11]:
def print_person_details(person):
    """Build a one-sentence summary of a person record.

    Despite its name, this function prints nothing; it returns the sentence.

    Args:
        person: Object exposing ``name``, ``age`` and ``agePlusOne`` attributes.

    Returns:
        str: The formatted summary sentence.
    """
    summary = f'{person.name} is {person.age} years old, and will be {person.agePlusOne} years old next year'
    return summary


df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])
# collect() returns the filtered rows to the driver for plain-Python processing.
df_filtered = df.select(
    df.name,
    df.age,
    (df.age + 1).alias('agePlusOne')
).filter(df.age > 3).collect()
# Pass the helper to map() directly; wrapping it in a lambda that only
# forwards its single argument adds nothing.
rows = map(print_person_details, df_filtered)

for row in rows:
    print(row)

Bob is 5 years old, and will be 6 years old next year


In [18]:
def print_person_details(person):
    """Build a one-sentence summary of a person record held in a mapping.

    Despite its name, this function prints nothing; it returns the sentence.

    Args:
        person: Mapping with the keys ``"name"``, ``"age"`` and ``"agePlusOne"``.

    Returns:
        str: The formatted summary sentence.
    """
    summary = f'{person["name"]} is {person["age"]} years old, and will be {person["agePlusOne"]} years old next year'
    return summary


df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], schema=['age', 'name'])
# The derived column and the filter refer to `age` by name through col(),
# while the two plain columns are still taken from the frame.
df_filtered = (
    df.select(
        df['name'],
        df['age'],
        (col('age') + 1).alias('agePlusOne'),
    )
    .filter(col('age') > 3)
    .collect()
)

# Convert each collected row to a dict, the shape the helper reads by key.
rows = (r.asDict() for r in df_filtered)

for row in rows:
    print(row)
    print(print_person_details(row))

{'name': 'Bob', 'age': 5, 'agePlusOne': 6}
Bob is 5 years old, and will be 6 years old next year


In [13]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# Python UDF that adds one year. A null age reaches the lambda as None, and
# `None + 1` would raise TypeError, so null is passed through unchanged.
age_plus_one_udf = udf(lambda age: None if age is None else age + 1, IntegerType())

# withColumn appends the derived column after the existing ones.
df.withColumn('agePlusOne', age_plus_one_udf(df.age)).show()

+---+-----+----------+
|age| name|agePlusOne|
+---+-----+----------+
|  2|Alice|         3|
|  5|  Bob|         6|
+---+-----+----------+


In [14]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# Python UDF that adds one year. A null age reaches the lambda as None, and
# `None + 1` would raise TypeError, so null is passed through unchanged.
age_plus_one_udf = udf(lambda age: None if age is None else age + 1, IntegerType())

# V1: project (name, age, agePlusOne) explicitly, then filter on age.
df.select(
    df.name,
    df.age,
    age_plus_one_udf(df.age).alias('agePlusOne')
).filter(df.age > 3).show()

# V2: filter first, then append agePlusOne with withColumn; the frame keeps
# its original (age, name) column order with the new column last.
df.filter(df.age > 3).withColumn('agePlusOne', age_plus_one_udf(df.age)).show()

+----+---+----------+
|name|age|agePlusOne|
+----+---+----------+
| Bob|  5|         6|
+----+---+----------+
+---+----+----------+
|age|name|agePlusOne|
+---+----+----------+
|  5| Bob|         6|
+---+----+----------+


In [15]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# Python UDF that adds one year. A null age reaches the lambda as None, and
# `None + 1` would raise TypeError, so null is passed through unchanged.
age_plus_one_udf = udf(lambda age: None if age is None else age + 1, IntegerType())

# Append the derived column, keep ages above 3, and collect the rows to the driver.
df_filtered = (
    df
    .withColumn('agePlusOne', age_plus_one_udf(df.age))
    .filter(df.age > 3)
    .collect()
)

# Plain dicts so the fields can be read by key below.
rows = [row.asDict() for row in df_filtered]

for row in rows:
    print(row)
    print(f'{row["name"]} is {row["age"]} years old, and will be {row["agePlusOne"]} years old next year')

{'age': 5, 'name': 'Bob', 'agePlusOne': 6}
Bob is 5 years old, and will be 6 years old next year


In [16]:
df = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])

# Python UDF that adds one year. A null age reaches the lambda as None, and
# `None + 1` would raise TypeError, so null is passed through unchanged.
age_plus_one_udf = udf(lambda age: None if age is None else age + 1, IntegerType())

# Same pipeline as the df.age variant, but the column is referenced by name
# through col() instead of through the DataFrame object.
df_filtered = (
    df
    .withColumn('agePlusOne', age_plus_one_udf(col('age')))
    .filter(col('age') > 3)
    .collect()
)

# Plain dicts so the fields can be read by key below.
rows = [row.asDict() for row in df_filtered]

for row in rows:
    print(row)
    print(f'{row["name"]} is {row["age"]} years old, and will be {row["agePlusOne"]} years old next year')

{'age': 5, 'name': 'Bob', 'agePlusOne': 6}
Bob is 5 years old, and will be 6 years old next year
