# SELECT, DROP, RENAME COLUMNS IN PYSPARK

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Sample rows as (id, name, age) tuples; column names are supplied separately.
data = [
    (1, "Alice", 29),
    (2, "Bob", 35),
]
df = spark.createDataFrame(data, schema=["id", "name", "age"])
df.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 29|
|  2|  Bob| 35|
+---+-----+---+



In [2]:
# select single column
# Store the projection under its own name so the three-column `df` created
# above is not overwritten, and display it so the cell has visible output.
name_df = df.select("name")
name_df.show()

In [4]:
# Rebuild the sample DataFrame, this time from dictionaries instead of tuples.
# Column names come from the dict keys; the printed table lists them
# alphabetically (age, id, name).
spark = SparkSession.builder.appName("Example").getOrCreate()
data = [
    dict(id=1, name="Alice", age=29),
    dict(id=2, name="Bob", age=35),
]
df = spark.createDataFrame(data=data)
df.show()

+---+---+-----+
|age| id| name|
+---+---+-----+
| 29|  1|Alice|
| 35|  2|  Bob|
+---+---+-----+



In [5]:
# Keep only the name and age columns, referenced through the DataFrame itself.
df = df.select(df["name"], df["age"])

In [7]:
# Choose columns from a list built at runtime; select() accepts the list
# directly as well as unpacked names.
columns_to_select = ["name", "age"]
df = df.select(columns_to_select)

# RENAMING COLUMNS

In [8]:
# Rename a single column: "name" becomes "full_name".
df = df.withColumnRenamed(existing="name", new="full_name")

In [10]:
# Rename several columns by chaining one withColumnRenamed() call per column.
# NOTE(review): old_col1 / old_col2 look like placeholder names — presumably
# they are not present in df at this point, in which case Spark leaves df
# unchanged; confirm.
df = (
    df
    .withColumnRenamed("old_col1", "new_col1")
    .withColumnRenamed("old_col2", "new_col2")
)

In [12]:
# Rename while projecting: select each column and give it a new name via alias().

from pyspark.sql.functions import col

df = df.select(
    col("full_name").alias("first_name"),
    col("age").alias("person_age"),
)

# ADDING COLUMNS

In [13]:
from pyspark.sql.functions import col, lit, expr, when

# Attach the same literal value ("USA") to every row as a new column.
df = df.withColumn(colName="country", col=lit("USA"))

In [15]:
# Derive a new column from an existing one: person_age scaled by 1.1.
# NOTE(review): the sample rows only carry id/name/age, so the "salary" here
# appears to be a stand-in computed from age — confirm whether a real salary
# column was intended.
df = df.withColumn(
    "salary_after_bonus",
    col("person_age") * 1.1,
)

In [17]:
# Compute a column from a SQL expression string evaluated by expr().
df = df.withColumn(colName="tax", col=expr("salary_after_bonus*0.2"))

In [19]:
# Flag rows with when()/otherwise(): "yes" above the threshold, "no" otherwise.
# NOTE(review): with the sample ages scaled by 1.1, no row should exceed 55000 —
# confirm the threshold against the data actually used.
is_high_earner = col("salary_after_bonus") > 55000
df = df.withColumn(
    "high_earner",
    when(is_high_earner, "yes").otherwise("no"),
)

In [21]:
# Multi-branch CASE WHEN: bucket salary_after_bonus into low / medium / high.
# Branches are written in ascending order; anything not matched by the first
# two conditions falls through to "high".
salary = col("salary_after_bonus")
salary_category = (
    when(salary < 60000, "low")
    .when((salary >= 60000) & (salary < 90000), "medium")
    .otherwise("high")
)
df = df.withColumn("salary_category", salary_category)

In [23]:
# Add several columns in one call by passing a name -> expression mapping.
# Both expressions are derived from salary_after_bonus.
# NOTE(review): DataFrame.withColumns is only available in newer PySpark
# releases (3.3+ per the PySpark docs) — confirm the runtime version.
salary = col("salary_after_bonus")
df = df.withColumns(
    dict(
        bonus=salary * 0.1,
        net_salary=salary - (salary * 0.2),
    )
)

# DROPPING COLUMNS

In [24]:
# Remove a single column by name.
# NOTE(review): no earlier cell adds a "department" column, so this is
# presumably a placeholder name — confirm.
column_to_drop = "department"
df = df.drop(column_to_drop)

In [25]:
# Remove several columns at once by unpacking a list of names into drop().
# NOTE(review): column1..column3 look like placeholder names — confirm.
columns_to_drop = ["column1", "column2", "column3"]
df = df.drop(*columns_to_drop)