In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Ensure Spark uses correct Python
# The driver and the workers must run the same interpreter, so both variables
# are set to one value. The machine-specific conda path is kept when it exists;
# otherwise fall back to the interpreter running this script so the example
# still starts on other machines instead of failing to launch Python workers.
PREFERRED_PYTHON = r"C:\Users\subba\anaconda3\envs\bigdataenv\python.exe"
python_exe = PREFERRED_PYTHON if os.path.isfile(PREFERRED_PYTHON) else sys.executable
for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[env_var] = python_exe

# Start (or reuse) the Spark session for this example.
spark = (
    SparkSession.builder
    .appName("FilterAndWithColumnExample")
    # Driver memory setting for this small local demo.
    .config("spark.driver.memory", "2g")
    # Turn off Python worker reuse between tasks.
    .config("spark.python.worker.reuse", "false")
    .getOrCreate()
)

# Sample input for the DataFrame: column names plus (id, name, age) rows.
columns = ["id", "name", "age"]

# Ids are sequential starting at 1, so derive them from each person's position.
people = [("Alice", 23), ("Bob", 30), ("Charlie", 17), ("David", 40)]
data = [
    (person_id, name, age)
    for person_id, (name, age) in enumerate(people, start=1)
]

# Build the DataFrame, then demonstrate filter() and withColumn().
# Everything runs inside try/finally so the Spark session is stopped even when
# one of the steps raises; otherwise a failure would leave the session running.
try:
    df = spark.createDataFrame(data, columns)

    print("✅ Original DataFrame:")
    df.show()

    # Apply filter: only people older than 20 (strictly greater than).
    filtered_df = df.filter(col("age") > 20)

    print("✅ Filtered DataFrame (age > 20):")
    filtered_df.show()

    # Apply withColumn: add a new column 'isAdult' to the *unfiltered* frame.
    # If age >= 18 → 'Yes', else 'No'
    is_adult = when(col("age") >= 18, "Yes").otherwise("No")
    transformed_df = df.withColumn("isAdult", is_adult)

    print("✅ Transformed DataFrame with new column 'isAdult':")
    transformed_df.show()
finally:
    # Always release the session, whether or not the steps above succeeded.
    spark.stop()


✅ Original DataFrame:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 23|
|  2|    Bob| 30|
|  3|Charlie| 17|
|  4|  David| 40|
+---+-------+---+

✅ Filtered DataFrame (age > 20):
+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 23|
|  2|  Bob| 30|
|  4|David| 40|
+---+-----+---+

✅ Transformed DataFrame with new column 'isAdult':
+---+-------+---+-------+
| id|   name|age|isAdult|
+---+-------+---+-------+
|  1|  Alice| 23|    Yes|
|  2|    Bob| 30|    Yes|
|  3|Charlie| 17|     No|
|  4|  David| 40|    Yes|
+---+-------+---+-------+

