You have a DataFrame with a date-of-birth column (DateOfBirth, named `dob` in the example below) stored in yyyy-MM-dd format. Add a new column Age (named `age` below) that holds each individual's age in completed years as of the current date.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, current_date, when

# Start a Spark session for this example, or reuse one that already exists
spark = SparkSession.builder.appName("Calculate Age").getOrCreate()

# Sample dates of birth as yyyy-MM-dd strings; each row is a one-field tuple
data = [
    (birth_date,)
    for birth_date in ("1990-01-01", "1985-05-20", "2000-12-15", "2019-01-04")
]

# Build the DataFrame with a single column named "dob"
columns = ["dob"]
df = spark.createDataFrame(data, columns)

# Compute each person's age in completed years as of today
df = df.withColumn("dob", col("dob").cast("date"))  # make sure dob is a date column
today = current_date()
birth = col("dob")

# The birthday is still ahead this year when today's month is earlier than the
# birth month, or the months match and today's day-of-month is earlier
earlier_month = month(today) < month(birth)
same_month_earlier_day = (month(today) == month(birth)) & (dayofmonth(today) < dayofmonth(birth))
birthday_pending = earlier_month | same_month_earlier_day

# Difference in calendar years, minus one if the birthday has not happened yet
years_apart = year(today) - year(birth)
df = df.withColumn("age", years_apart - when(birthday_pending, 1).otherwise(0))

# Display the result
df.show()


+----------+---+
|       dob|age|
+----------+---+
|1990-01-01| 34|
|1985-05-20| 39|
|2000-12-15| 23|
|2019-01-04|  5|
+----------+---+

