In [3]:
# --- Imports (kept together at the top of the cell) ---
from google.colab import drive
from pyspark.sql import SparkSession
# NOTE: the wildcard imports below are kept because later cells use names such as
# col()/column()/StructType without a prefix. Be aware that
# pyspark.sql.functions also exports names like sum/min/max that shadow the
# Python built-ins of the same name.
from pyspark.sql.types import *
from pyspark.sql.functions import *

# How long to wait for the Drive mount before giving up: 5 minutes, in milliseconds.
DRIVE_MOUNT_TIMEOUT_MS = 5 * 60 * 1000

# Reuse the already-running Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Mount Google Drive under /content/drive. force_remount=True remounts even if
# the drive is already mounted, so re-running this cell is safe.
drive.mount('/content/drive', force_remount=True, timeout_ms=DRIVE_MOUNT_TIMEOUT_MS)


Mounted at /content/drive


In [4]:
# Location of the employee CSV on the mounted Google Drive.
# (Despite its name this variable holds a *path*, not a DataFrame; the name is
# kept unchanged so that any later cell referring to it keeps working.)
df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"

# Explicit schema for the CSV: one nullable field per column, in file order.
# Supplying it up front means Spark does not have to infer the column types.
# (The variable name's spelling is kept as-is for compatibility with later cells.)
# NOTE(review): FloatType is single precision; consider DoubleType or
# DecimalType for Salary if exact amounts matter.
employeeSechema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Salary", FloatType(), True),
    StructField("Joining_Date", DateType(), True),
    StructField("Department", StringType(), True),
    StructField("Performance_Rating", IntegerType(), True),
    StructField("Email", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Phone", StringType(), True),
])

# Load the DataFrame with the defined schema. The path comes from the variable
# above, so there is a single place to change it. header=True tells Spark that
# the first line of the file is a header row rather than data.
df = spark.read.load(
    path=df_employee_data,
    format="csv",
    header=True,
    schema=employeeSechema,
)

# Show the resulting schema, then the first rows, to confirm the load worked.
df.printSchema()
df.show()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: float (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Performance_Rating: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Phone: string (nullable = true)

+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+
| ID|         Name|Age| Salary|Joining_Date| Department|Performance_Rating|               Email|           Address|       Phone|
+---+-------------+---+-------+------------+-----------+------------------+--------------------+------------------+------------+
|  1|Alice Johnson| 29|75000.0|  2021-03-15|Engineering|                 4|alice.johnson@exa...|123 Elm Street, NY|123-456-7890|
|  2|    Bob Smith| 35|85000.0|  2020-08-20|  Marketing|                 5|bob.smith@e

###  1. Different Methods to Select Columns
In PySpark, you can select specific columns in multiple ways:

*   Using the `col()` function, the `column()` function, or the column name as a plain string




In [5]:
# Three ways to tell select() which column you want. Each entry below is passed
# to df.select(...) on its own, and the resulting one-column DataFrame is shown.
single_column_selectors = (
    col("Name"),     # Using col() function
    column("Age"),   # Using column() function
    "Salary",        # Directly using string name
)

for selector in single_column_selectors:
    df.select(selector).show()

+-------------+
|         Name|
+-------------+
|Alice Johnson|
|    Bob Smith|
|  Cathy Brown|
|  David White|
|    Eva Green|
+-------------+

+---+
|Age|
+---+
| 29|
| 35|
| 42|
| 30|
| 28|
+---+

+-------+
| Salary|
+-------+
|75000.0|
|85000.0|
|95000.0|
|67000.0|
|72000.0|
+-------+



### 2. Selecting Multiple Columns Together
Combine different methods to select multiple columns:

In [8]:
# Multiple columns: the different selector styles can be mixed freely within a
# single select() call.
mixed_selectors = [
    "Name",                # plain string name
    "Age",                 # plain string name
    col("Salary"),         # col() function
    column("Department"),  # column() function
    df.Phone,              # attribute access on the DataFrame
]

df2 = df.select(*mixed_selectors)
df2.show()

+-------------+---+-------+-----------+------------+
|         Name|Age| Salary| Department|       Phone|
+-------------+---+-------+-----------+------------+
|Alice Johnson| 29|75000.0|Engineering|123-456-7890|
|    Bob Smith| 35|85000.0|  Marketing|987-654-3210|
|  Cathy Brown| 42|95000.0|    Finance|567-890-1234|
|  David White| 30|67000.0|         HR|345-678-9012|
|    Eva Green| 28|72000.0|Engineering|234-567-8901|
+-------------+---+-------+-----------+------------+



### 3. Listing All Columns in a DataFrame
To get a list of all the column names:





In [None]:
# Get all column names of the DataFrame as a Python list.
# Being the last expression in the cell, the list is displayed as the cell output.
df.columns


