# A single PySpark script with different functions for:

- Reading data (CSV, Parquet)

- Transformation (filter, join, groupBy, agg)

- Handling nulls

- Partitioning & Repartitioning

- Using UDF

- Writing data

- Explaining optimization concepts

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Wildcard import: supplies col, avg, ... used by the cells below. Be aware that
# it also shadows Python builtins of the same name (e.g. sum, min, max).
from pyspark.sql.functions import *
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.window import Window

# NOTE: the SparkSession is intentionally not created in this cell. Calling
# SparkSession.builder.getOrCreate() here would start an unnamed session, and
# the later `.appName(...).getOrCreate()` call would then just reuse it, so the
# requested application name could no longer label the running application.
# The session is created once, with its name, in the "Create Spark Session" cell.

In [2]:
## Create Spark Session
# Get the active session (or start one) labelled with this application name;
# every cell below uses `spark` as its entry point.
spark = (
    SparkSession.builder
    .appName('Interview_Askable_Functions')
    .getOrCreate()
)

In [4]:
## Sample Data
# Column names, in the same order as the fields of each row tuple below.
columns = ["id", "name", "department", "salary", "join_date"]

# Five employee rows; None marks a missing salary or join date.
data = [
    (1, "Alice", "HR", 5000, None),
    (2, "Bob", "IT", 7000, "2024-01-01"),
    (3, "Charlie", "Finance", 6000, "2024-01-05"),
    (4, "David", "IT", None, "2024-02-01"),
    (5, "Eva", "Finance", 7500, None),
]

# Column types are inferred from the row values; only the names are supplied.
df = spark.createDataFrame(data, schema=columns)

df.show()

+---+-------+----------+------+----------+
| id|   name|department|salary| join_date|
+---+-------+----------+------+----------+
|  1|  Alice|        HR|  5000|      NULL|
|  2|    Bob|        IT|  7000|2024-01-01|
|  3|Charlie|   Finance|  6000|2024-01-05|
|  4|  David|        IT|  NULL|2024-02-01|
|  5|    Eva|   Finance|  7500|      NULL|
+---+-------+----------+------+----------+



In [10]:
# 1. Filter employees with salary > 6000
# Rows whose salary is NULL do not satisfy the comparison, so they are dropped
# along with everyone earning 6000 or less.
high_salary_df = df.where(col("salary") > 6000)
high_salary_df.show()

def filter_high_salary(df, threshold=6000):
    """Return the rows of ``df`` whose ``salary`` is strictly above ``threshold``.

    Args:
        df: DataFrame that has a numeric ``salary`` column.
        threshold: Exclusive lower bound for ``salary``. Defaults to 6000,
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        A new DataFrame with only the matching rows. Rows whose ``salary``
        is NULL never satisfy the comparison and are therefore excluded.
    """
    return df.filter(col('salary') > threshold)


filter_high_salary(df).show()


+---+----+----------+------+----------+
| id|name|department|salary| join_date|
+---+----+----------+------+----------+
|  2| Bob|        IT|  7000|2024-01-01|
|  5| Eva|   Finance|  7500|      NULL|
+---+----+----------+------+----------+

+---+----+----------+------+----------+
| id|name|department|salary| join_date|
+---+----+----------+------+----------+
|  2| Bob|        IT|  7000|2024-01-01|
|  5| Eva|   Finance|  7500|      NULL|
+---+----+----------+------+----------+



In [14]:
# 2. Group by department & find avg salary

# One output row per department. avg() skips NULL salaries, which is why a
# department with a missing salary still gets an average from its other rows.
avg_salry_by_group = (
    df.groupBy("department")
    .agg(avg("salary").alias("avarage_salary"))
)
avg_salry_by_group.show()

def avarage_salary_by_group(df, *, show=True):
    """Compute the average salary per department.

    The function previously returned the result of ``DataFrame.show()``,
    which is always ``None``, so the aggregated data could not be reused.
    It now returns the aggregated DataFrame and still prints it by default,
    so existing calls keep producing the same table.

    Args:
        df: DataFrame with ``department`` and ``salary`` columns.
        show: Keyword-only. When true (the default), print the result with
            ``DataFrame.show()`` before returning it.

    Returns:
        A DataFrame with one row per department and the columns
        ``department`` and ``avarage_salary`` (the column name is kept
        as-is for compatibility). NULL salaries are ignored by ``avg``.
    """
    averaged = df.groupBy(col('department')).agg(
        avg(col('salary')).alias('avarage_salary')
    )
    if show:
        averaged.show()
    return averaged


avarage_salary_by_group(df)



+----------+--------------+
|department|avarage_salary|
+----------+--------------+
|        HR|        5000.0|
|   Finance|        6750.0|
|        IT|        7000.0|
+----------+--------------+

+----------+--------------+
|department|avarage_salary|
+----------+--------------+
|        HR|        5000.0|
|   Finance|        6750.0|
|        IT|        7000.0|
+----------+--------------+



In [None]:
# 3. Handle NULL values



