
# PySpark DataFrame API Examples

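This notebook demonstrates common **PySpark DataFrame operations**: creating DataFrames, selecting, filtering and transforming columns, aggregating, joining, handling nulls, and controlling partitions.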

---


In [None]:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session used by every cell below. getOrCreate() reuses a
# session that is already active, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("PySparkDataFrameExamples")
    .getOrCreate()
)


### Create a DataFrame from a Python Collection

In [None]:

# Sample records as plain Python tuples in (name, age, city) order.
# This is still an ordinary list; Spark is not involved until createDataFrame.
data = [
    ("Ravi", 25, "Bengaluru"),
    ("Priya", 30, "Hyderabad"),
    ("Ankit", 28, "Pune"),
    ("Lakshmi", 32, "Chennai"),
]

# Show the raw rows and confirm the container type.
print(data)
print(type(data))


In [None]:
# Column names, listed in the same order as the fields of each tuple in `data`.
columns = [
    "Name",
    "Age",
    "City",
]

# Show the names and confirm they are held in a plain list.
print(columns)
print(type(columns))


In [None]:
# Build the base DataFrame used throughout the notebook. Only column names
# are supplied, so Spark infers each column's type from the Python values.
df = spark.createDataFrame(data, schema=columns)

# Print the rows as a text table.
df.show()

### Selecting Columns

In [None]:

# Project only the Name and Age columns; City is left out of the result.
df.select(df.Name, df.Age).show()


### Filtering Rows

In [None]:

# Keep only rows whose Age is strictly greater than 28.
df.filter(F.col("Age") > 28).show()


### Add or Transform Columns

In [None]:

# Derive a new column from Age. withColumn returns a new DataFrame,
# so `df` itself is left unchanged.
df_new = df.withColumn(
    "Age_After_5_Years",
    F.col("Age") + 5,
)
df_new.show()


### Rename Columns

In [None]:

# Rename City to Location in a new DataFrame; all other columns are untouched.
df_renamed = df.withColumnRenamed(existing="City", new="Location")
df_renamed.show()


### Drop Columns

In [None]:

# Produce a copy of `df` without the City column.
df_dropped = df.drop(df.City)
df_dropped.show()


### Sorting Data

In [None]:

# Sort by Age from highest to lowest.
df.orderBy(df.Age.desc()).show()


### Conditional Column with `when` and `otherwise`

In [None]:

# Label each row by age band: strictly over 30 is "Senior", everyone else
# (including exactly 30) is "Junior".
df_cond = df.withColumn(
    "Category",
    F.when(F.col("Age") > 30, F.lit("Senior")).otherwise(F.lit("Junior")),
)
df_cond.show()


### DataFrame Summary and Statistics

In [None]:

# Show both built-in statistics views one after the other: describe() first,
# then summary(), which additionally reports approximate quartiles.
for stats_df in (df.describe(), df.summary()):
    stats_df.show()


### Group By and Aggregations

In [None]:

# (department, salary) pairs; IT appears twice so the aggregations below
# have a group with more than one row.
group_data = [
    ("IT", 40000),
    ("HR", 25000),
    ("IT", 45000),
    ("Finance", 30000),
]
df_group = spark.createDataFrame(group_data, schema=["Department", "Salary"])




In [None]:
# Display the salary rows before aggregating (show()'s defaults made explicit).
df_group.show(n=20, truncate=True)

In [None]:
# Average salary per department.
df_group.groupBy(F.col("Department")).agg(
    F.avg(F.col("Salary")).alias("Avg_Salary"),
).show()

In [None]:
# Several aggregates per department computed in one agg() call.
# Min_Salary uses F.min; it previously used F.max and so merely duplicated
# the Max_Salary column.
df_group.groupBy("Department").agg(
    F.avg("Salary").alias("Avg_Salary"),
    F.max("Salary").alias("Max_Salary"),
    F.min("Salary").alias("Min_Salary"),
    F.count("*").alias("Count"),  # number of rows in each department
).show()

### Removing Duplicates

In [None]:
# Rebind group_data / df_group to a version of the salary data in which the
# ("IT", 45000) and ("Finance", 30000) rows each appear twice.
group_data = [
    ("IT", 40000),
    ("HR", 25000),
    ("IT", 45000),
    ("Finance", 30000),
    ("IT", 45000),
    ("Finance", 30000),
]
df_group = spark.createDataFrame(group_data, schema=["Department", "Salary"])

In [None]:
# With no column subset, rows are compared on every column, so only fully
# identical rows are collapsed.
df_group.dropDuplicates(subset=None).show()


### Joins Between DataFrames

In [None]:

# Name -> Department lookup. Lakshmi is deliberately absent, so the inner
# join below drops her row from the result.
dept_data = [
    ("Ravi", "IT"),
    ("Priya", "HR"),
    ("Ankit", "Finance"),
]
df_dept = spark.createDataFrame(dept_data, schema=["Name", "Department"])

# Joining on the column name (rather than an expression) keeps a single
# Name column in the output.
joined = df.join(other=df_dept, on=["Name"], how="inner")
joined.show()


### Handling Null Values

In [None]:

# Age is missing (None) for Ravi and Ankit.
null_data = [
    ("Ravi", None),
    ("Priya", 30),
    ("Ankit", None),
]
df_null = spark.createDataFrame(null_data, schema=["Name", "Age"])

# Replace nulls: fill missing Age values with 25.
df_null.fillna(25, subset=["Age"]).show()

# Drop nulls: discard every row that contains at least one null.
df_null.na.drop(how="any").show()


### Repartition and Coalesce

In [None]:

# repartition() redistributes the rows into the requested number of partitions.
print("Repartition to 4 partitions:")
df_repart = df.repartition(numPartitions=4)
print(df_repart.rdd.getNumPartitions())

# coalesce() lowers the partition count of the repartitioned DataFrame.
print("Coalesce back to 2 partitions:")
df_coalesce = df_repart.coalesce(numPartitions=2)
print(df_coalesce.rdd.getNumPartitions())



---

### ✅ Summary of APIs Covered

| Operation | API | Description |
|------------|-----|-------------|
| Create DataFrame | `createDataFrame()` | From list/collection |
| Select Columns | `select()` | Choose specific columns |
| Filter Rows | `filter()` | Apply condition filters |
| Add Column | `withColumn()` | Add or modify a column |
| Rename Column | `withColumnRenamed()` | Rename existing column |
| Drop Column | `drop()` | Remove a column |
| Group & Aggregate | `groupBy().agg()` | Aggregate functions |
| Sort | `orderBy()` | Ascending/Descending |
| Remove Duplicates | `dropDuplicates()` | Keep unique rows |
| Join | `join()` | Combine multiple DataFrames |
| Handle Nulls | `fillna()`, `na.drop()` | Manage missing values |
| Conditional Logic | `when().otherwise()` | Add derived columns |
| Statistics | `describe()`, `summary()` | Summary metrics |
| Partition Control | `repartition()`, `coalesce()` | Manage partitions |

---

