# Add new columns to a PySpark dataframe

Useful links: 
- [SparkByExamples](https://sparkbyexamples.com/pyspark/pyspark-add-new-column-to-dataframe/)

In [0]:
from datetime import date
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one, otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Small test dataset: one Row per subject, keyword names become the column names
subjects = [
    Row(SUBJID='1', AGE=25, SEX='F', HEIGHT=150, SCREENED=date(2024, 1, 15)),
    Row(SUBJID='2', AGE=56, SEX='M', HEIGHT=168, SCREENED=date(2024, 2, 10)),
    Row(SUBJID='3', AGE=44, SEX=None, HEIGHT=170, SCREENED=date(2024, 1, 17)),  # SEX deliberately missing
]
df = spark.createDataFrame(subjects)

# Bare last expression so the notebook echoes the dataframe
df

#### Add constant values as a new variable

In [0]:
# Add a constant column: every row receives the same literal value.
df = df.withColumn(
    "STUDYID",            # > The name of the new variable
    F.lit('JnJ_STUDY_A')  # > The constant value we assign
)

# withColumn calls are lazy and can be chained. Wrapping the chain in parentheses
# lets us break it over several lines for better readability, without trailing backslashes.
df = (
    df
    # A bare F.lit(None) produces a column of NullType ("void"), which some sinks
    # (e.g. Parquet) cannot store. Casting gives the all-null column a concrete type.
    # NOTE(review): "string" is assumed here - use whatever type the column should hold.
    .withColumn("Missings", F.lit(None).cast("string"))
    .withColumn("WEIGHT", F.lit(50))
)

df.display()

#### Using another column from the dataframe

In [0]:
# Derive new columns from columns that already exist in the dataframe.

# Columns can be accessed with two different syntaxes: df['NAME'] or df.NAME
height_scaled = df['HEIGHT'] / 100  # bracket syntax (presumably cm -> m; confirm units)

# BMI: WEIGHT divided by the squared scaled height, rounded to 1 decimal
bmi = F.round(df.WEIGHT / height_scaled ** 2, 1)  # attribute syntax

# AGE and SEX joined with '/'. Note that concat_ws skips null inputs,
# so a subject without SEX gets just the age (no trailing separator).
age_sex = F.concat_ws('/', 'AGE', 'SEX')

df = df.withColumn('BMI', bmi).withColumn('A/S', age_sex)

df.display()

#### Add a column based on a condition

In [0]:
from pyspark.sql.functions import when

# Decode SEX into a readable label. Conditions are evaluated in order and the
# first match wins; rows matching none of them (including null SEX) fall through
# to otherwise() and get null.
sex_code = F.col("SEX")
sex_label = (
    when(sex_code == 'F', "Female")
    .when(sex_code == 'M', "Male")
    .otherwise(None)
)

df = df.withColumn("SEXL", sex_label)

df.display()

In [0]:
# Alternatively, in that easy scenario, we could use a mapping dictionary and a User Defined Function (UDF)
from pyspark.sql.types import StringType

# Code -> label lookup table
SEXL = {'F': 'Female', 'M': 'Male'}


@F.udf(returnType=StringType())
def SEXL_udf(code):
    """Return the label for a SEX code, or None when the code is not in SEXL (e.g. null)."""
    return SEXL.get(code)


# Overwrites the SEXL column with the UDF result
df = df.withColumn('SEXL', SEXL_udf(F.col('SEX')))

df.display()