In [13]:
# Notebook setup: Spark entry point, schema types, column functions, and Colab helpers.
from pyspark.sql import SparkSession
# NOTE(review): wildcard import — only needed by the commented-out schema below in this view.
from pyspark.sql.types import *
# Wildcard import that supplies split, size, explode and array_contains to the cells below.
# NOTE(review): this also rebinds any Python builtin that pyspark.sql.functions
# re-defines under the same name (e.g. sum/min/max) — prefer explicit imports or
# `import pyspark.sql.functions as F` once all usages in the notebook are known.
from pyspark.sql.functions import *
# Default session; a later cell calls getOrCreate() again with an application name.
spark = SparkSession.builder.getOrCreate()
# NOTE(review): looks redundant with the wildcard import above — confirm before removing.
from pyspark.sql.functions import regexp_replace, col
# `drive` is referenced only by the commented-out Google Drive mount below.
from google.colab import drive
# NOTE(review): `from os import *` replaces the builtin open() with os.open in this
# namespace; no os name is used in the visible cells — verify the rest of the notebook.
from os import *

# Mount Google Drive with a longer timeout
# drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

# df_employee_data = "/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv"
# employeeSechema = StructType([
#     StructField("ID",IntegerType() ,True),
#     StructField("Name",StringType() ,True),
#     StructField("Age",IntegerType() ,True),
#     StructField("Salary",FloatType() ,True),
#     StructField("Joining_Date",DateType() ,True),
#     StructField("Department",StringType() ,True),
#     StructField("Performance_Rating",IntegerType() ,True),
#     StructField("Email",StringType() ,True),
#     StructField("Address",StringType() ,True),
#     StructField("Phone",StringType() ,True)

# ])
# # Load the DataFrame with the defined schema
# #df = spark.read.csv(path=df_employee_data, header=True, schema=employeeSechema)
# df = spark.read.load(path="/content/drive/MyDrive/Colab Notebooks/dataSet/employee_data.csv", format="csv", header = True, schema=employeeSechema)
# df.printSchema()
# df.show(50)

### Create a new Spark session

In [14]:
# Request a session labelled with this notebook's application name.
# NOTE(review): the first cell already called getOrCreate(), so this presumably hands
# back that same session rather than starting a fresh one — confirm if a new session
# (or the new app name) is actually required.
spark = (
    SparkSession.builder
    .appName('StrongAndStringFunction')
    .getOrCreate()
)

In [15]:
# Sample rows: (EmployeeID, Name, Department, space-separated Skills)
data = [
    (1, "Alice", "HR", "Communication Management"),
    (2, "Bob", "IT", "Programming Networking"),
    (3, "Charlie", "Finance", "Accounting Analysis"),
    (4, "David", "HR", "Recruiting Communication"),
    (5, "Eve", "IT", "Cloud DevOps"),
]

# Column names only; the column types are left for Spark to infer from the rows.
columns = ["EmployeeID", "Name", "Department", "Skills"]

# Build the base DataFrame that the following cells work from.
df = spark.createDataFrame(data, schema=columns)

# Show the original DataFrame without truncating long strings.
df.show(truncate=False)



+----------+-------+----------+------------------------+
|EmployeeID|Name   |Department|Skills                  |
+----------+-------+----------+------------------------+
|1         |Alice  |HR        |Communication Management|
|2         |Bob    |IT        |Programming Networking  |
|3         |Charlie|Finance   |Accounting Analysis     |
|4         |David  |HR        |Recruiting Communication|
|5         |Eve    |IT        |Cloud DevOps            |
+----------+-------+----------+------------------------+



## 1. Split the "Skills" column:

In [26]:
# Turn the space-separated "Skills" string into an array column named "skills_Array".
df2 = df.select(
    "EmployeeID",
    "Name",
    split("Skills", " ").alias("skills_Array"),
)
df2.show(truncate=False)

# Note: split() breaks each Skills value on the space separator and yields an array;
# alias("skills_Array") gives that array column a meaningful name.

+----------+-------+---------------------------+
|EmployeeID|Name   |skills_Array               |
+----------+-------+---------------------------+
|1         |Alice  |[Communication, Management]|
|2         |Bob    |[Programming, Networking]  |
|3         |Charlie|[Accounting, Analysis]     |
|4         |David  |[Recruiting, Communication]|
|5         |Eve    |[Cloud, DevOps]            |
+----------+-------+---------------------------+



## 2. Select the first skill from the "skills_Array" column:

In [27]:
# Take element 0 of each employee's skills array and expose it as "first_skill".
(
    df2
    .select(
        "EmployeeID",
        "Name",
        col("skills_Array").getItem(0).alias("first_skill"),
    )
    .show()
)

# Note: array positions are 0-based, so index 0 of skills_Array is the first skill.

+----------+-------+-------------+
|EmployeeID|   Name|  first_skill|
+----------+-------+-------------+
|         1|  Alice|Communication|
|         2|    Bob|  Programming|
|         3|Charlie|   Accounting|
|         4|  David|   Recruiting|
|         5|    Eve|        Cloud|
+----------+-------+-------------+



## 3. Calculate the size of the "skills_Array" column:

In [28]:
# Count how many skills each employee's array holds.
(
    df2
    .select(
        "EmployeeID",
        "Name",
        size(col("skills_Array")).alias("Size_of_Array"),
    )
    .show()
)

+----------+-------+-------------+
|EmployeeID|   Name|Size_of_Array|
+----------+-------+-------------+
|         1|  Alice|            2|
|         2|    Bob|            2|
|         3|Charlie|            2|
|         4|  David|            2|
|         5|    Eve|            2|
+----------+-------+-------------+



## 4. Check if the array contains a specific skill:

In [29]:
# Check whether each employee's skill list contains the skill "Cloud".
# The Skills string is split again here, so this cell only needs the base `df`.
df.select(
    "EmployeeID",
    "Name",
    array_contains(split("Skills", " "), "Cloud").alias("Has_cloud_skill"),
).show(truncate=False)

# Note: the result is a boolean per employee — true only when "Cloud" is one of
# the elements produced by the split.

+----------+-------+---------------+
|EmployeeID|Name   |Has_cloud_skill|
+----------+-------+---------------+
|1         |Alice  |false          |
|2         |Bob    |false          |
|3         |Charlie|false          |
|4         |David  |false          |
|5         |Eve    |true           |
+----------+-------+---------------+



## 5. Use the explode function to transform array elements into individual rows:

In [30]:
# Add a "Skills" column holding one array element per row (one row per employee/skill pair).
df3 = df2.withColumn("Skills", explode("skills_Array"))
df3.select(["EmployeeID", "Name", "Skills"]).show(truncate=False)

# Note: explode() emits a separate row for every element of the array column, so an
# employee with two skills appears twice, once per skill.

+----------+-------+-------------+
|EmployeeID|Name   |Skills       |
+----------+-------+-------------+
|1         |Alice  |Communication|
|1         |Alice  |Management   |
|2         |Bob    |Programming  |
|2         |Bob    |Networking   |
|3         |Charlie|Accounting   |
|3         |Charlie|Analysis     |
|4         |David  |Recruiting   |
|4         |David  |Communication|
|5         |Eve    |Cloud        |
|5         |Eve    |DevOps       |
+----------+-------+-------------+

