In [1]:
# Spark SQL entry point, column types and built-in column functions.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *  # wildcard: later cells rely on names it provides (ltrim, rpad, ...)
from pyspark.sql.functions import regexp_replace, col

# Colab helper for Google Drive (the mount call further down is commented out).
from google.colab import drive

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Mount Google Drive with a longer timeout
# drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"
# employeeSechema = StructType([
#     StructField("ID",IntegerType() ,True),
#     StructField("Name",StringType() ,True),
#     StructField("Age",IntegerType() ,True),
#     StructField("Salary",FloatType() ,True),
#     StructField("Joining_Date",DateType() ,True),
#     StructField("Department",StringType() ,True),
#     StructField("Performance_Rating",IntegerType() ,True),
#     StructField("Email",StringType() ,True),
#     StructField("Address",StringType() ,True),
#     StructField("Phone",StringType() ,True)

# ])
# # Load the DataFrame with the defined schema
# #df = spark.read.csv(path=df_employee_data, header=True, schema=employeeSechema)
# df = spark.read.load(path="/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv", format="csv", header = True, schema=employeeSechema)
# df.printSchema()
# df.show(50)

### Create New Spark Session

In [None]:
# Obtain a session under an explicit application name.
# NOTE(review): getOrCreate() hands back an already-active session when one
# exists (the first cell creates one), so the app name may not take effect
# unless this runs on a fresh kernel -- confirm.
spark = (
    SparkSession.builder
    .appName('StrongAndStringFunction')
    .getOrCreate()
)

In [7]:
# Sample employee rows: (EmployeeID, Name, Department).
# Several names carry leading and/or trailing spaces so the trim functions
# have something to remove.
data = [
    (1, " Alice ", "HR"),
    (2, " Bob", "IT"),
    (3, "Charlie ", "Finance"),
    (4, " David ", "HR"),
    (5, "Eve ", "IT"),
]

# Column names, in the same order as the tuple fields above
columns = ["EmployeeID", "Name", "Department"]

# Build the DataFrame from the rows and column names
df = spark.createDataFrame(data, schema=columns)

# Display the original DataFrame without truncating cell values
df.show(truncate=False)



+----------+--------+----------+
|EmployeeID|Name    |Department|
+----------+--------+----------+
|1         | Alice  |HR        |
|2         | Bob    |IT        |
|3         |Charlie |Finance   |
|4         | David  |HR        |
|5         |Eve     |IT        |
+----------+--------+----------+



## Applying Trimming and Padding Functions
1. ltrim(), rtrim(), and trim():

  * ltrim(): Removes leading spaces.
  * rtrim(): Removes trailing spaces.
  * trim(): Removes both leading and trailing spaces.

2. lpad() and rpad():

  * lpad(): Pads the left side of a string with a specified character up to a certain length.
  * rpad(): Pads the right side of a string with a specified character up to a certain length.

In [16]:
# Apply trimming and padding functions to the Name column.
# (The stray `from os import truncate` was dropped: `truncate` is only ever
# used as a keyword argument of DataFrame.show(), never as a name.)

# Length of the longest Name value, surrounding spaces included.
# `max` and `length` are the pyspark.sql.functions versions made available by
# the wildcard import in the first cell, not the Python builtins.
max_length = df.select(max(length(col("Name")))).collect()[0][0]

result_df = df.select(
  col("EmployeeID"),
  col("Department"),
  ltrim(col("Name")).alias("ltrim_name"),  # Remove leading spaces
  rtrim(col("Name")).alias("rtrim_name"),  # Remove trailing spaces
  trim(col("Name")).alias("trim_name"),  # Remove both leading and trailing spaces
  lpad(col("Name"), 10, "X").alias("lpad_name"),  # Left-pad with X to a string length of 10
  rpad(col("Name"), 10, "Y").alias("rpad_name"),  # Right-pad with Y to a string length of 10
  # Right-pad with "_" to the longest Name length found above. This column has
  # its own alias so that "rpad_name" stays unambiguous when selected by name.
  rpad(col("Name"), max_length, "_").alias("rpad_max_name"),
)

result_df.show(truncate=False)


+----------+----------+----------+----------+---------+----------+----------+---------+
|EmployeeID|Department|ltrim_name|rtrim_name|trim_name|lpad_name |rpad_name |rpad_name|
+----------+----------+----------+----------+---------+----------+----------+---------+
|1         |HR        |Alice     | Alice    |Alice    |XXX Alice | Alice YYY| Alice _ |
|2         |IT        |Bob       | Bob      |Bob      |XXXXXX Bob| BobYYYYYY| Bob____ |
|3         |Finance   |Charlie   |Charlie   |Charlie  |XXCharlie |Charlie YY|Charlie  |
|4         |HR        |David     | David    |David    |XXX David | David YYY| David _ |
|5         |IT        |Eve       |Eve       |Eve      |XXXXXXEve |Eve YYYYYY|Eve ____ |
+----------+----------+----------+----------+---------+----------+----------+---------+



### Output Explanation:
* ltrim_name: The leading spaces from the Name column are removed.
* rtrim_name: The trailing spaces from the Name column are removed.
* trim_name: Both leading and trailing spaces are removed from the Name column.
* lpad_name: The Name column is padded on the left with "X" until the string length becomes 10.
* rpad_name: The Name column is padded on the right with "Y" until the string length becomes 10.
* Last column: The Name column is padded on the right with "_" up to the length of the longest Name value (8 characters in this sample, spaces included).