- Name: 04.0_dataframe_Operations
- Author: Shamas Imran
- Description: Exploring DataFrame functions and operations
- Date: 04-Oct-2025

In [1]:
from pyspark.sql.types import *

# Lakehouse-relative folder for the student Parquet dataset; later cells reuse this path.
studentFilePath = "Files/client_output_data/parquet/student"

# Column definitions as (name, type, nullable); only StudentID is declared non-nullable.
student_fields = [
    StructField("StudentID", IntegerType(), False),
    StructField("StudentName", StringType(), True),
    StructField("StudentAge", IntegerType(), True),
]
student_schema = StructType(student_fields)

# Seed rows in (StudentID, StudentName, StudentAge) order.
student_data = [
    (1, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (4, "Shamas", 40),
]

# `spark` is not defined in this notebook; presumably it is the session object
# injected by the notebook runtime.
df_student = spark.createDataFrame(student_data, schema=student_schema)
df_student.show()

# "overwrite" replaces whatever is already stored at the target path.
student_writer = df_student.write.mode("overwrite")
student_writer.parquet(studentFilePath)
print(f"Parquet file written to {studentFilePath}")

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 3, Finished, Available, Finished)

+---------+-----------+----------+
|StudentID|StudentName|StudentAge|
+---------+-----------+----------+
|        1|      Alice|        34|
|        2|        Bob|        45|
|        3|    Charlie|        29|
|        4|     Shamas|        40|
+---------+-----------+----------+

Parquet file written to Files/client_output_data/parquet/student


In [3]:
# Read the stored dataset back and print it.
# NOTE: the captured output below was produced after the overwrite cell labeled In [2]
# had already run (see the execution counters), so it shows that cell's ID/name pairs.
# No sort is applied, so the rows are not in the order they were written.
student_on_disk = spark.read.parquet(studentFilePath)
student_on_disk.show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 5, Finished, Available, Finished)

+---------+-----------+----------+
|StudentID|StudentName|StudentAge|
+---------+-----------+----------+
|        3|    Charlie|        29|
|        1|     Shamas|        40|
|        4|      Alice|        34|
|        2|        Bob|        45|
+---------+-----------+----------+



In [2]:
from pyspark.sql.types import *

# Same three-column layout as before: (name, type, nullable).
student_fields = [
    StructField("StudentID", IntegerType(), False),
    StructField("StudentName", StringType(), True),
    StructField("StudentAge", IntegerType(), True),
]
student_schema = StructType(student_fields)

# Same names and ages as the first cell, but Alice and Shamas have swapped IDs (4 and 1).
student_data = [
    (4, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (1, "Shamas", 40),
]

df_student = spark.createDataFrame(student_data, schema=student_schema)

# Replace the previously written dataset with these rows.
student_writer = df_student.write.mode("overwrite")
student_writer.parquet(studentFilePath)
print(f"Parquet file written to {studentFilePath}")

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 4, Finished, Available, Finished)

Parquet file written to Files/client_output_data/parquet/student


In [4]:
import pandas as pd
from pyspark.sql.types import *

# Same three-column layout as the earlier cells: (name, type, nullable).
student_fields = [
    StructField("StudentID", IntegerType(), False),
    StructField("StudentName", StringType(), True),
    StructField("StudentAge", IntegerType(), True),
]
student_schema = StructType(student_fields)

student_data = [
    # The same four rows that the overwrite cell stored.
    (4, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (1, "Shamas", 40),
    # Three additional rows; names repeat ("Ali", "Bob") but the IDs are new.
    (5, "Ali", 50),
    (6, "Ali", 50),
    (7, "Bob", 45),
]

df_student = spark.createDataFrame(student_data, schema=student_schema)

# "append" adds these rows to what is already stored, so the first four rows end up
# in the dataset twice (visible in the output below).
student_writer = df_student.write.mode("append")
student_writer.parquet(studentFilePath)

# Read everything back to see the combined result.
student_on_disk = spark.read.parquet(studentFilePath)
student_on_disk.show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 6, Finished, Available, Finished)

+---------+-----------+----------+
|StudentID|StudentName|StudentAge|
+---------+-----------+----------+
|        3|    Charlie|        29|
|        3|    Charlie|        29|
|        1|     Shamas|        40|
|        1|     Shamas|        40|
|        4|      Alice|        34|
|        4|      Alice|        34|
|        2|        Bob|        45|
|        2|        Bob|        45|
|        5|        Ali|        50|
|        6|        Ali|        50|
|        7|        Bob|        45|
+---------+-----------+----------+



### 💾 Spark DataFrame Write Modes in Fabric Notebooks

When writing DataFrames to a table, file, or path in Fabric Lakehouse or Delta, you can use the `.mode()` option to control the write behavior.

| Mode | Description | Use Case Example |
|------|--------------|------------------|
| **append** | Adds new records to the existing data without deleting previous content. | Adding daily or incremental records to a Delta table. |
| **overwrite** | Replaces existing data completely (drops existing files and writes new ones). | Full refresh or reprocessing scenarios. |
| **ignore** | If data already exists at the path or table, the write operation is skipped (no error thrown). | Avoiding accidental overwrites during reruns. |
| **error** or **errorifexists** *(default)* | Throws an error if data already exists at the destination. | Ensuring no duplicate writes occur. |

In [6]:
# Project the three columns, exposing StudentAge under the shorter name "Age".
subset_columns = [
    df_student["StudentID"],
    df_student["StudentName"],
    df_student["StudentAge"].alias("Age"),
]
df_studentsubset = df_student.select(*subset_columns)
df_studentsubset.show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 8, Finished, Available, Finished)

+---------+-----------+---+
|StudentID|StudentName|Age|
+---------+-----------+---+
|        4|      Alice| 34|
|        2|        Bob| 45|
|        3|    Charlie| 29|
|        1|     Shamas| 40|
|        5|        Ali| 50|
|        6|        Ali| 50|
|        7|        Bob| 45|
+---------+-----------+---+



In [7]:
# Keep only the rows whose Age is strictly greater than 34.
# The same condition can also be passed to where() instead of filter().
is_older_than_34 = df_studentsubset["Age"] > 34
df_studentsubset.filter(is_older_than_34).show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 9, Finished, Available, Finished)

+---------+-----------+---+
|StudentID|StudentName|Age|
+---------+-----------+---+
|        2|        Bob| 45|
|        1|     Shamas| 40|
|        5|        Ali| 50|
|        6|        Ali| 50|
|        7|        Bob| 45|
+---------+-----------+---+



In [10]:
from pyspark.sql.functions import monotonically_increasing_id

# Give the age column an explicit name before deriving a second age from it.
df_studentsubset_renamed = df_studentsubset.withColumnRenamed("Age", "SolarAge")

solar_age = df_studentsubset_renamed["SolarAge"]

# LunarAge = (SolarAge * 365 + SolarAge * 11) / 365, then cast to int.
# The cast truncates rather than rounds (29 stays 29 in the output of the next cell).
# NOTE(review): the 11 presumably represents the yearly day gap between the solar and
# lunar calendars - confirm with the author.
scaled_age = (solar_age * 365) + (solar_age * 11)
lunar_age = (scaled_age / 365).cast("int")

# row_id values are increasing and distinct per row, but not consecutive
# (see the large values in the output of the next cell).
df_studentFinal = (
    df_studentsubset_renamed
    .withColumn("LunarAge", lunar_age)
    .withColumn("row_id", monotonically_increasing_id())
)

display(df_studentFinal)

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a8d46427-68dd-4059-8b49-6cd1b4b80268)

In [11]:
from pyspark.sql.functions import col

student_id = col("StudentID")

# Sort by StudentID; ascending is the default, so no explicit direction is needed.
df_studentFinal.orderBy(student_id).show()

# Same sort key, descending.
df_studentFinal.orderBy(student_id.desc()).show()

# sort() with an explicit ascending direction on the same key.
df_studentFinal.sort(student_id.asc()).show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 13, Finished, Available, Finished)

+---------+-----------+--------+--------+-----------+
|StudentID|StudentName|SolarAge|LunarAge|     row_id|
+---------+-----------+--------+--------+-----------+
|        1|     Shamas|      40|      41|34359738368|
|        2|        Bob|      45|      46|17179869184|
|        3|    Charlie|      29|      29|25769803776|
|        4|      Alice|      34|      35| 8589934592|
|        5|        Ali|      50|      51|42949672960|
|        6|        Ali|      50|      51|51539607552|
|        7|        Bob|      45|      46|60129542144|
+---------+-----------+--------+--------+-----------+

+---------+-----------+--------+--------+-----------+
|StudentID|StudentName|SolarAge|LunarAge|     row_id|
+---------+-----------+--------+--------+-----------+
|        7|        Bob|      45|      46|60129542144|
|        6|        Ali|      50|      51|51539607552|
|        5|        Ali|      50|      51|42949672960|
|        4|      Alice|      34|      35| 8589934592|
|        3|    Charlie|    

In [12]:
# Remove the SolarAge column; drop() returns a new DataFrame, so the name is rebound to it.
obsolete_columns = ["SolarAge"]
df_studentFinal = df_studentFinal.drop(*obsolete_columns)
display(df_studentFinal)

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6beb3cfb-4292-43a7-b85b-06e8317ba388)

In [14]:
# Widget rendering supplied by the notebook environment.
display(df_studentFinal)

# Plain-text table of the rows.
df_studentFinal.show()

# Schema printed as an indented tree.
df_studentFinal.printSchema()

# Schema as a StructType object.
final_schema = df_studentFinal.schema
print(final_schema)

# Total number of rows.
row_total = df_studentFinal.count()
print(row_total)

# Column names as a Python list.
column_names = df_studentFinal.columns
print(column_names)

# Summary statistics per column (min, max, avg, stddev).
df_studentFinal.describe().show()

StatementMeta(, c96b76f3-c035-4ba7-82ea-c62cfc47c112, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4dad809c-2f2b-443f-af2f-34b1e611bc09)

+---------+-----------+--------+-----------+
|StudentID|StudentName|LunarAge|     row_id|
+---------+-----------+--------+-----------+
|        4|      Alice|      35| 8589934592|
|        2|        Bob|      46|17179869184|
|        3|    Charlie|      29|25769803776|
|        1|     Shamas|      41|34359738368|
|        5|        Ali|      51|42949672960|
|        6|        Ali|      51|51539607552|
|        7|        Bob|      46|60129542144|
+---------+-----------+--------+-----------+

root
 |-- StudentID: integer (nullable = false)
 |-- StudentName: string (nullable = true)
 |-- LunarAge: integer (nullable = true)
 |-- row_id: long (nullable = false)

StructType([StructField('StudentID', IntegerType(), False), StructField('StudentName', StringType(), True), StructField('LunarAge', IntegerType(), True), StructField('row_id', LongType(), False)])
7
['StudentID', 'StudentName', 'LunarAge', 'row_id']
+-------+-----------------+-----------+------------------+--------------------+
|sum