In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

data = [
    (1, "Alice", "HR", "2024-01-10", None, ["Python", "SQL"]),
    (2, "Bob", None, "2024-02-15", 5000, ["Java"]),
    (3, "Charlie", "IT", None, 7000, None)
]

columns = ["id", "name", "dept", "join_date", "salary", "skills"]

df = spark.createDataFrame(data, columns)
df.show(truncate=False)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/15 16:18:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/15 16:18:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---+-------+----+----------+------+-------------+
|id |name   |dept|join_date |salary|skills       |
+---+-------+----+----------+------+-------------+
|1  |Alice  |HR  |2024-01-10|NULL  |[Python, SQL]|
|2  |Bob    |NULL|2024-02-15|5000  |[Java]       |
|3  |Charlie|IT  |NULL      |7000  |NULL         |
+---+-------+----+----------+------+-------------+



select


Used to pick specific columns from a DataFrame.

In [2]:
df.select("name", "dept").show()

+-------+----+
|   name|dept|
+-------+----+
|  Alice|  HR|
|    Bob|NULL|
|Charlie|  IT|
+-------+----+



In [3]:
df.select("*").show()


+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [4]:
df.show()


+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [5]:
df.select(df.columns).show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



withColumn


Used to add a new column or modify an existing column.

In [6]:
df.withColumn("salary_bonus",col("salary")+1000).show()

+---+-------+----+----------+------+-------------+------------+
| id|   name|dept| join_date|salary|       skills|salary_bonus|
+---+-------+----+----------+------+-------------+------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|        NULL|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|        6000|
|  3|Charlie|  IT|      NULL|  7000|         NULL|        8000|
+---+-------+----+----------+------+-------------+------------+



drop

Used to remove a column.

In [7]:
df.drop("skills").show()

+---+-------+----+----------+------+
| id|   name|dept| join_date|salary|
+---+-------+----+----------+------+
|  1|  Alice|  HR|2024-01-10|  NULL|
|  2|    Bob|NULL|2024-02-15|  5000|
|  3|Charlie|  IT|      NULL|  7000|
+---+-------+----+----------+------+



alias


Used to rename a column temporarily (mostly in select).

In [9]:
df.select(col("name").alias("employee_name")).show()

+-------------+
|employee_name|
+-------------+
|        Alice|
|          Bob|
|      Charlie|
+-------------+



cast


Used to change the data type of a column.

In [10]:
df.withColumn("Salary_int",col("salary").cast("int")).show()

+---+-------+----+----------+------+-------------+----------+
| id|   name|dept| join_date|salary|       skills|Salary_int|
+---+-------+----+----------+------+-------------+----------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|      NULL|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|      5000|
|  3|Charlie|  IT|      NULL|  7000|         NULL|      7000|
+---+-------+----+----------+------+-------------+----------+



when (Conditional Logic)


Works like IF‚ÄìELSE condition.

In [11]:
df.withColumn(
    "salary_level",
    when(col("salary")>6000, "high")
    .when(col("salary")<=6000, "Medium")
    .otherwise("Unknown")
).show()

+---+-------+----+----------+------+-------------+------------+
| id|   name|dept| join_date|salary|       skills|salary_level|
+---+-------+----+----------+------+-------------+------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|     Unknown|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|      Medium|
|  3|Charlie|  IT|      NULL|  7000|         NULL|        high|
+---+-------+----+----------+------+-------------+------------+



explode

Used to break array values into multiple rows.

In [15]:
df.select("name", explode("skills").alias("skill")).show()

+-----+------+
| name| skill|
+-----+------+
|Alice|Python|
|Alice|   SQL|
|  Bob|  Java|
+-----+------+



coalesce


Used to replace NULL values with another value.

In [16]:
df.show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [17]:
df.withColumn("salary",coalesce(col("salary"),lit("Unknown"))).show()

+---+-------+----+----------+-------+-------------+
| id|   name|dept| join_date| salary|       skills|
+---+-------+----+----------+-------+-------------+
|  1|  Alice|  HR|2024-01-10|Unknown|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|   5000|       [Java]|
|  3|Charlie|  IT|      NULL|   7000|         NULL|
+---+-------+----+----------+-------+-------------+



Date Functions

Common date functions in PySpark:

Convert string ‚Üí date

In [18]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- join_date: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
df.withColumn("join_date",to_date("join_date")).show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [20]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- join_date: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



## üî¥ Why `join_date` is STILL `string` after `withColumn(...)`?

### Your code:

```python
df.withColumn("join_date", to_date("join_date")).show()
df.printSchema()
```

### ‚ùó Key Reason

üëâ **PySpark DataFrames are IMMUTABLE**

That means:

> Any transformation (`withColumn`, `select`, `drop`, etc.)
> **does NOT change `df` unless you assign it back**

---

## üß† What actually happened?

```python
df.withColumn("join_date", to_date("join_date")).show()
```

‚úîÔ∏è Spark created a **new temporary DataFrame**
‚úîÔ∏è Converted `join_date` to `date`
‚úîÔ∏è Displayed it using `show()`
‚ùå **Original `df` was NOT modified**

So when you run:

```python
df.printSchema()
```

You are still checking the **old DataFrame**, where `join_date` is a string.

---

## ‚úÖ Correct Way (Fix)

You must **reassign** the DataFrame:

```python
df = df.withColumn("join_date", to_date("join_date"))
df.printSchema()
```

### ‚úÖ Output will now be:

```
 |-- join_date: date (nullable = true)
```

‚úîÔ∏è Now the column is truly converted.

---

## üß™ Proof Example

### ‚ùå Without reassignment

```python
df.withColumn("join_date", to_date("join_date")).show()
df.printSchema()   # still string
```

### ‚úÖ With reassignment

```python
df = df.withColumn("join_date", to_date("join_date"))
df.printSchema()   # date
```

---

## üîπ Why PySpark works this way?

* DataFrames are **immutable** (like SQL tables)
* Prevents accidental data corruption
* Improves performance & fault tolerance

---

## üìù Rule to Remember (Interview Gold ‚≠ê)

> **In PySpark, every transformation returns a new DataFrame.
> If you don‚Äôt assign it, the change is lost.**

---

## üìå Quick Tip

Same rule applies to:

* `drop()`
* `select()`
* `withColumnRenamed()`
* `filter()`

In [21]:
df1= df.withColumn("join_date",to_date("join_date")).show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [22]:
df1.printSchema()

AttributeError: 'NoneType' object has no attribute 'printSchema'

In [23]:
df1= df.withColumn("join_date",to_date("join_date"))

In [24]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- join_date: date (nullable = true)
 |-- salary: long (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



where 


Used to keep only the rows that match a condition

In [25]:
df.show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [26]:
df.where(df.salary>5000).show()

+---+-------+----+---------+------+------+
| id|   name|dept|join_date|salary|skills|
+---+-------+----+---------+------+------+
|  3|Charlie|  IT|     NULL|  7000|  NULL|
+---+-------+----+---------+------+------+



filter in PySpark


Does the same job as where ‚Äî filters rows based on condition.

In [27]:
df.show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



In [28]:
df.filter(df.dept=="HR").show()

+---+-----+----+----------+------+-------------+
| id| name|dept| join_date|salary|       skills|
+---+-----+----+----------+------+-------------+
|  1|Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
+---+-----+----+----------+------+-------------+



Important Point

üëâ where and filter are identical in PySpark
Choose whichever you like ‚Äî no difference in result or performance.

Using AND condition


Filter rows when multiple conditions must be true.

In [31]:
df.filter((df.dept=="IT")& (df.salary>6000)).show()

+---+-------+----+---------+------+------+
| id|   name|dept|join_date|salary|skills|
+---+-------+----+---------+------+------+
|  3|Charlie|  IT|     NULL|  7000|  NULL|
+---+-------+----+---------+------+------+



Using OR condition

In [32]:
df.filter((df.dept=="IT") | (df.salary>5000)).show()

+---+-------+----+---------+------+------+
| id|   name|dept|join_date|salary|skills|
+---+-------+----+---------+------+------+
|  3|Charlie|  IT|     NULL|  7000|  NULL|
+---+-------+----+---------+------+------+



In [33]:
df.where ((df.dept=="IT") | (df.salary>5000)).show()

+---+-------+----+---------+------+------+
| id|   name|dept|join_date|salary|skills|
+---+-------+----+---------+------+------+
|  3|Charlie|  IT|     NULL|  7000|  NULL|
+---+-------+----+---------+------+------+



In [34]:
df.filter("salary">5000 and dept="IT").show()

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (935234003.py, line 1)

In [35]:
df.filter("salary">5000 and dept="IT"").show()

SyntaxError: EOL while scanning string literal (1595778094.py, line 1)

In [36]:
df.filter("salary>5000 and dept='IT'").show()

+---+-------+----+---------+------+------+
| id|   name|dept|join_date|salary|skills|
+---+-------+----+---------+------+------+
|  3|Charlie|  IT|     NULL|  7000|  NULL|
+---+-------+----+---------+------+------+



In [37]:

df.show()


+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



IN


Check if a value matches any value from a given list.

In [38]:
df.filter(df.dept.isin("IT","HR")).show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



NOT IN


Exclude rows that match values in a list.

In [40]:
df.filter(~df.dept.isin("IT","HR")).show()

+---+----+----+---------+------+------+
| id|name|dept|join_date|salary|skills|
+---+----+----+---------+------+------+
+---+----+----+---------+------+------+



BETWEEN


Check if a value falls within a range (start and end included).

In [41]:
df.filter(df.salary.between(4000,7000)).show()

+---+-------+----+----------+------+------+
| id|   name|dept| join_date|salary|skills|
+---+-------+----+----------+------+------+
|  2|    Bob|NULL|2024-02-15|  5000|[Java]|
|  3|Charlie|  IT|      NULL|  7000|  NULL|
+---+-------+----+----------+------+------+



NULL Filtering
Important Rule

‚ùå = or != does NOT work with NULL
‚úîÔ∏è Use isNull() / isNotNull()

In [42]:
df.filter(df.dept.isNull()).show()

+---+----+----+----------+------+------+
| id|name|dept| join_date|salary|skills|
+---+----+----+----------+------+------+
|  2| Bob|NULL|2024-02-15|  5000|[Java]|
+---+----+----+----------+------+------+



In [44]:
df.show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  2|    Bob|NULL|2024-02-15|  5000|       [Java]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+



NOT NULL values

In [43]:
df.filter(df.dept.isNotNull()).show()

+---+-------+----+----------+------+-------------+
| id|   name|dept| join_date|salary|       skills|
+---+-------+----+----------+------+-------------+
|  1|  Alice|  HR|2024-01-10|  NULL|[Python, SQL]|
|  3|Charlie|  IT|      NULL|  7000|         NULL|
+---+-------+----+----------+------+-------------+

