In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
# Get the SparkSession for this notebook: reuses an already-active session
# if one exists, otherwise creates a new one with default settings.
spark = SparkSession.builder.getOrCreate()

from google.colab import drive

# Attach Google Drive under /content/drive. A remount is forced so a stale
# mount is never reused, and the timeout is raised to 300000 ms (5 minutes).
drive.mount(
    '/content/drive',
    force_remount=True,
    timeout_ms=300000,
)

Mounted at /content/drive


In [3]:
# Path of the employee CSV on the mounted Google Drive.
# (Despite the `df_` prefix this is a path string, not a DataFrame; the name
# is kept so that any other cell referring to it keeps working.)
df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"

# Explicit schema for the CSV: column names, Spark types, and nullability
# (every column is declared nullable). Supplying it means the column types
# are fixed here rather than inferred from the file.
employeeSechema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Salary", FloatType(), True),
    StructField("Joining_Date", DateType(), True),
    StructField("Department", StringType(), True),
    StructField("Performance_Rating", IntegerType(), True),
    StructField("Email", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Phone", StringType(), True),
])

# Load the CSV with the schema above. The path comes from the single
# `df_employee_data` definition instead of a second hard-coded copy, so the
# two can never drift apart. header=True treats the first row as column names.
df = spark.read.load(
    path=df_employee_data,
    format="csv",
    header=True,
    schema=employeeSechema,
)
df.printSchema()
df.show()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: float (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Performance_Rating: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Phone: string (nullable = true)

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@e

# PySpark DataFrame Manipulation, Part 2: Adding, Renaming, and Dropping Columns

### 1. Adding New Columns with withColumn()

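In PySpark, `withColumn()` is the standard way to add a new column to a DataFrame. The new column can hold a constant value (wrapped in `lit()`) or a value computed from existing columns. DataFrames are immutable, so `withColumn()` returns a new DataFrame rather than modifying the original.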

In [10]:
# Add a column holding a constant: lit(1) yields the literal value 1 for every row.
newDef1 = df.withColumn(colName='NewColumn', col=lit(1))
newDef1.show()


+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+---------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|NewColumn|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+---------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|        1|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@example...|456 Oak Avenue, LA|987-654-3210|        1|
|  3|  Cathy Brown| 42|95000.0|  2019-11-10|    Finance|                 3|cathy.brown@examp...| 789 Pine Road, TX|567-890-1234|        1|
|  4|  David White| 30|67000.0|  2022-01-05|         HR|                 4|david.white@examp...|234 Maple Lane, IL|345-678-9012|        1|
|  5|    Eva Green| 28|7200

In [11]:
# Add a column computed from a SQL expression: the new boolean column is
# true on rows whose Department equals 'Sales'.
newDef2 = df.withColumn(
    colName="withDepartment",
    col=expr("Department == 'Sales'"),
)
newDef2.show()

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+--------------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|withDepartment|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+--------------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|         false|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@example...|456 Oak Avenue, LA|987-654-3210|         false|
|  3|  Cathy Brown| 42|95000.0|  2019-11-10|    Finance|                 3|cathy.brown@examp...| 789 Pine Road, TX|567-890-1234|         false|
|  4|  David White| 30|67000.0|  2022-01-05|         HR|                 4|david.white@examp...|234 Maple Lane, IL|345-678-9012|        

### 2. Renaming Columns with withColumnRenamed()

In [12]:
# Rename Performance_Rating to Performance; the result is a new DataFrame
# and `df` itself keeps the original column name.
newDf3 = df.withColumnRenamed(existing="Performance_Rating", new='Performance')
newDf3.show()

+---+-------------+---+-------+------------+-----------+-----------+--------------------+------------------+------------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance|               Email|           Address|       Phone|
+---+-------------+---+-------+------------+-----------+-----------+--------------------+------------------+------------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|          4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|          5|bob.smith@example...|456 Oak Avenue, LA|987-654-3210|
|  3|  Cathy Brown| 42|95000.0|  2019-11-10|    Finance|          3|cathy.brown@examp...| 789 Pine Road, TX|567-890-1234|
|  4|  David White| 30|67000.0|  2022-01-05|         HR|          4|david.white@examp...|234 Maple Lane, IL|345-678-9012|
|  5|    Eva Green| 28|72000.0|  2023-06-22|Engineering|          5|eva.green@example...|321 Birch Blvd, FL|234-567-8901|
+---+-------------+---+-

### 3. Dropping Columns with drop()

In [17]:
# Starting point: the DataFrame that still carries the derived
# 'withDepartment' column.
newDef2.show()

# Drop a single column.
newDef2_1 = newDef2.drop('withDepartment')
newDef2_1.show()

# Drop multiple columns in one call. The names are spelled exactly as in the
# schema ('Phone', 'Address'): drop() silently ignores names it cannot
# resolve, so lowercase spellings would do nothing if the session ran with
# spark.sql.caseSensitive=true.
newDef2_2 = newDef2_1.drop('Phone', 'Address')
newDef2_2.show()

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+--------------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|withDepartment|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+--------------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|         false|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@example...|456 Oak Avenue, LA|987-654-3210|         false|
|  3|  Cathy Brown| 42|95000.0|  2019-11-10|    Finance|                 3|cathy.brown@examp...| 789 Pine Road, TX|567-890-1234|         false|
|  4|  David White| 30|67000.0|  2022-01-05|         HR|                 4|david.white@examp...|234 Maple Lane, IL|345-678-9012|        