In [0]:
# Sample rows, one tuple per person: (id, name, age).
data = [
    (1, "Alice", 20),
    (2, "Bob", 17),
    (3, "Charlie", 23),
    (4, "David", 16),
]

# Column names, in the same order as the tuple fields above.
columns = ["id", "name", "age"]

# Build the DataFrame that every later cell operates on, then render it.
df = spark.createDataFrame(data, schema=columns)
df.display()

id,name,age
1,Alice,20
2,Bob,17
3,Charlie,23
4,David,16


### select(): Used to select specific columns from a DataFrame. For example:

In [0]:
# Project the DataFrame down to the "name" column only.
(
    df
    .select("name")
    .display()
)

name
Alice
Bob
Charlie
David


### selectExpr(): Allows selecting columns and performing transformations using SQL expressions. For example:

In [0]:
# selectExpr() takes SQL expression strings. "age + 1" gives each person's age
# one year from now, which is what the alias age_next_year promises
# (the previous expression, age * 4, quadrupled the age instead).
df.selectExpr("name", "age + 1 as age_next_year").display()

name,age_next_year
Alice,21
Bob,18
Charlie,24
David,17


### col(): Returns a Column object based on the given column name. Useful for referencing columns programmatically:

In [0]:
from pyspark.sql.functions import col

# Keep only the rows whose age is strictly greater than 18.
(
    df
    .filter(col("age") > 18)
    .display()
)

id,name,age
1,Alice,20
3,Charlie,23


### expr(): Parses a SQL expression string and returns it as a Column. It allows complex transformations and SQL-like operations:

In [0]:
from pyspark.sql.functions import expr

# Show each name next to a derived column: the SQL string doubles the age
# and names the result age_double.
(
    df
    .select(
        "name",
        expr("age * 2 as age_double"),
    )
    .display()
)

name,age_double
Alice,40
Bob,34
Charlie,46
David,32


### when(): Allows conditional operations within DataFrame transformations, similar to SQL's CASE WHEN statement:

In [0]:
from pyspark.sql.functions import when

# Add a "status" column: "adult" when age is 18 or more, "minor" otherwise.
# `col` comes from the import in the earlier col() cell.
(
    df
    .withColumn(
        "status",
        when(col("age") >= 18, "adult").otherwise("minor"),
    )
    .display()
)

id,name,age,status
1,Alice,20,adult
2,Bob,17,minor
3,Charlie,23,adult
4,David,16,minor


### lit(): Creates a Column with a literal value, which can be useful for adding constant values or conditions:

In [0]:
from pyspark.sql.functions import lit

# Add a "country" column that holds the same literal value on every row.
(
    df
    .withColumn("country", lit("India"))
    .display()
)

id,name,age,country
1,Alice,20,India
2,Bob,17,India
3,Charlie,23,India
4,David,16,India
