# A single PySpark script with different functions for:

- Reading data (CSV, Parquet)

- Transformation (filter, join, groupBy, agg)

- Handling nulls

- Partitioning & Repartitioning

- Using UDFs

- Writing data

- Explaining optimization concepts

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # Import the function
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col, udf
from pyspark.sql.window import Window 

In [2]:
## Create Spark Session
# getOrCreate() returns the currently active session when there is one;
# otherwise it builds a new session using the app name configured here.
spark = (
    SparkSession.builder
    .appName('Interview_Askable_Functions')
    .getOrCreate()
)

In [3]:
## Sample Data
# Employee rows laid out as (id, name, department, salary, join_date).
# None marks a missing salary / join date so the null-handling examples
# further down have something to work on.
columns = ["id", "name", "department", "salary", "join_date"]

data = [
    (1, "Alice", "HR", 5000, None),
    (2, "Bob", "IT", 7000, "2024-01-01"),
    (3, "Charlie", "Finance", 6000, "2024-01-05"),
    (4, "David", "IT", None, "2024-02-01"),
    (5, "Eva", "Finance", 7500, None),
]

# Only column names are supplied, so Spark infers the column types from the
# Python values.
df = spark.createDataFrame(data, schema=columns)

df.show()

+---+-------+----------+------+----------+
| id|   name|department|salary| join_date|
+---+-------+----------+------+----------+
|  1|  Alice|        HR|  5000|      NULL|
|  2|    Bob|        IT|  7000|2024-01-01|
|  3|Charlie|   Finance|  6000|2024-01-05|
|  4|  David|        IT|  NULL|2024-02-01|
|  5|    Eva|   Finance|  7500|      NULL|
+---+-------+----------+------+----------+



In [4]:
# 1. Filter employees with salary > 6000
# Rows whose salary is NULL never satisfy the comparison, so they are
# dropped together with the low earners.
high_salary_df = df.filter(df["salary"] > 6000)
high_salary_df.show()

def filter_high_salary(df, min_salary=6000):
    """Return the rows of ``df`` whose ``salary`` is strictly above ``min_salary``.

    Args:
        df: DataFrame that has a numeric ``salary`` column.
        min_salary: Exclusive lower bound for ``salary``. Defaults to 6000,
            so existing one-argument calls behave exactly as before.

    Returns:
        A new DataFrame. Rows with a NULL salary never satisfy the
        comparison and are therefore excluded.
    """
    return df.filter(col('salary') > min_salary)


filter_high_salary(df).show()


+---+----+----------+------+----------+
| id|name|department|salary| join_date|
+---+----+----------+------+----------+
|  2| Bob|        IT|  7000|2024-01-01|
|  5| Eva|   Finance|  7500|      NULL|
+---+----+----------+------+----------+

+---+----+----------+------+----------+
| id|name|department|salary| join_date|
+---+----+----------+------+----------+
|  2| Bob|        IT|  7000|2024-01-01|
|  5| Eva|   Finance|  7500|      NULL|
+---+----+----------+------+----------+



In [5]:
# 2. Group by department & find avg salary
# avg() ignores NULL salaries, so a department's average is taken over its
# non-null salaries only.
avg_salry_by_group = (
    df.groupBy("department")
    .agg(avg("salary").alias("avarage_salary"))
)
avg_salry_by_group.show()

def avarage_salary_by_group(df):
    """Print the average salary per department and return that DataFrame.

    Args:
        df: DataFrame with ``department`` and ``salary`` columns.

    Returns:
        DataFrame with one row per department and an ``avarage_salary``
        column holding the mean of that department's salaries.
    """
    per_department = df.groupBy(col('department')).agg(
        avg(col('salary')).alias('avarage_salary')
    )
    # show() only prints and returns None, so return the DataFrame itself
    # (not the result of show()) to keep the helper usable in further
    # transformations, like the other helpers in this script.
    per_department.show()
    return per_department


avarage_salary_by_group(df)



+----------+--------------+
|department|avarage_salary|
+----------+--------------+
|        HR|        5000.0|
|   Finance|        6750.0|
|        IT|        7000.0|
+----------+--------------+

+----------+--------------+
|department|avarage_salary|
+----------+--------------+
|        HR|        5000.0|
|   Finance|        6750.0|
|        IT|        7000.0|
+----------+--------------+



In [6]:
# 3. Handle NULL values
# Per-column replacements: a missing salary becomes 0 and a missing join
# date becomes the placeholder string "1900-01-01".
fillNullDefaultValue = df.na.fill({"salary": 0, "join_date": "1900-01-01"})

fillNullDefaultValue.show()

def fillNullDefaultValueFun(df):
    """Return ``df`` with NULL ``salary`` set to 0 and NULL ``join_date`` set to "1900-01-01"."""
    replacements = {"salary": 0, "join_date": "1900-01-01"}
    return df.na.fill(replacements)


fillNullDefaultValueFun(df).show()


+---+-------+----------+------+----------+
| id|   name|department|salary| join_date|
+---+-------+----------+------+----------+
|  1|  Alice|        HR|  5000|1900-01-01|
|  2|    Bob|        IT|  7000|2024-01-01|
|  3|Charlie|   Finance|  6000|2024-01-05|
|  4|  David|        IT|     0|2024-02-01|
|  5|    Eva|   Finance|  7500|1900-01-01|
+---+-------+----------+------+----------+

+---+-------+----------+------+----------+
| id|   name|department|salary| join_date|
+---+-------+----------+------+----------+
|  1|  Alice|        HR|  5000|1900-01-01|
|  2|    Bob|        IT|  7000|2024-01-01|
|  3|Charlie|   Finance|  6000|2024-01-05|
|  4|  David|        IT|     0|2024-02-01|
|  5|    Eva|   Finance|  7500|1900-01-01|
+---+-------+----------+------+----------+



In [7]:
# Create Department Data Frame
# Lookup table pairing each department code with its full name.
dept_column = ["dept", "dept_fullName"]
dept_data = [
    ("HR", "Human Resource"),
    ("IT", "Information Tech"),
    ("Finance", "Accounts"),
]
dept_df = spark.createDataFrame(dept_data, schema=dept_column)
dept_df.show()






+-------+----------------+
|   dept|   dept_fullName|
+-------+----------------+
|     HR|  Human Resource|
|     IT|Information Tech|
|Finance|        Accounts|
+-------+----------------+



In [8]:
# 4. Join employees with the department lookup table
# A left join keeps every employee row even when its department has no
# match; the select leaves the redundant `dept` join key out of the result.
joinTable_df = (
    df.join(dept_df, on=df["department"] == dept_df["dept"], how="left")
    .select("id", "name", "department", "salary", "join_date", "dept_fullName")
)
joinTable_df.show()





+---+-------+----------+------+----------+----------------+
| id|   name|department|salary| join_date|   dept_fullName|
+---+-------+----------+------+----------+----------------+
|  1|  Alice|        HR|  5000|      NULL|  Human Resource|
|  2|    Bob|        IT|  7000|2024-01-01|Information Tech|
|  3|Charlie|   Finance|  6000|2024-01-05|        Accounts|
|  4|  David|        IT|  NULL|2024-02-01|Information Tech|
|  5|    Eva|   Finance|  7500|      NULL|        Accounts|
+---+-------+----------+------+----------+----------------+



In [13]:
# 5. Partition & Repartition Example
# Hash-partitions the rows by department into 10 partitions; this appears
# as an Exchange (shuffle) step in the physical plan.
repartDf = df.repartition(10, col("department"))
repartDf.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange hashpartitioning(department#474, 10), REPARTITION_BY_NUM, [plan_id=199]
   +- LocalTableScan [id#472L, name#473, department#474, salary#475L, join_date#476]




In [14]:
# Shrink to 2 partitions; the plan shows a Coalesce step placed on top of
# the existing Exchange rather than a second shuffle.
coalsecDF = repartDf.coalesce(numPartitions=2)
coalsecDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Coalesce 2
   +- Exchange hashpartitioning(department#480, 10), REPARTITION_BY_NUM, [plan_id=210]
      +- LocalTableScan [id#478L, name#479, department#480, salary#481L, join_date#482]




In [None]:
# 6. UDF Example

def upper_case(name):
    """Return ``name`` upper-cased, or ``None`` when ``name`` is ``None`` or empty."""
    if not name:
        return None
    return name.upper()


# Wrap the plain Python function as a Spark UDF producing a string column.
upper_udf = udf(upper_case, returnType=StringType())

# NOTE: for plain upper-casing the built-in `upper` column function does the
# same job without running Python per row; the UDF is used here as an example.
upper_df = df.withColumn("name_upper", upper_udf(df["name"]))

upper_df.show()



In [None]:
# 7. Write to Parquet
# "overwrite" replaces whatever already exists at the target path.
df.write.mode("overwrite").parquet('./temp/interview')

# To end up with a single output file, coalesce to one partition first:
# df.coalesce(1).write.mode("overwrite").parquet('./temp/interview')


In [None]:
# 8. Read CSV with an explicit schema, dropping rows that do not conform to it.
# NOTE(review): `schema` was used here without being defined anywhere earlier
# in the script, which raises NameError. The definition below assumes the CSV
# mirrors the employee sample columns — adjust it to the real file layout.
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("salary", LongType(), True),
    StructField("join_date", StringType(), True),
])

# DROPMALFORMED silently discards records that cannot be parsed against the
# schema instead of failing the read.
df = spark.read.option("mode", "DROPMALFORMED").csv("path/to/data.csv", schema=schema)