> **Question 1.**
>
![alt text](question_ss/01.jpg "question")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col, coalesce, when

In [0]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("seperate_hobbies")
    .getOrCreate()
)

In [0]:
# Sample input: one row per person, with all hobbies packed into a single
# comma-separated string.
columns = ["Name", "Hobbies"]
data = [
    ('Alice', 'Badminton, Tennis'),
    ('Bob', 'Tennis, Cricket'),
    ('Julis', 'Cricket, Carrom'),
]
df = spark.createDataFrame(data, columns)

In [0]:
# Print the input rows (Name, Hobbies) before any transformation is applied.
df.show()


In [0]:
import os
# Environment check: show which Python interpreters PySpark is configured to use
# (prints None when a variable is not set).
for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    print(f"{env_var}:", os.environ.get(env_var))


In [0]:
# split() converts the Hobbies string into an array of strings.
# The pattern is a regular expression: splitting on ",\s*" (a comma plus any
# whitespace after it) keeps the separator's trailing space out of the values,
# so we get "Tennis" rather than " Tennis".
df.select(
    col("Name"),
    split(col("Hobbies"), r",\s*").alias("Hobbies"),
).show()


In [0]:
# explode() turns each element of the array into its own row, giving one
# (Name, Hobby) row per hobby.
# Split on ",\s*" rather than "," so the exploded values carry no leading space.
df.select(
    col("Name"),
    explode(split(col("Hobbies"), r",\s*")).alias("Hobbies"),
).show()

![alt text](question_ss/02.jpg "question")

In [0]:
# Sample input: three city columns where a missing value is represented either
# by an empty string or by None.
columns = ['city1', 'city2', 'city3']
data = [
    ('Goa', '', 'Mumbai'),
    ('', 'Mumbai', None),
    (None, '', 'Pune'),
]
df = spark.createDataFrame(data, columns)
df.show()

In [0]:
# Add a column 'firstnotnull' holding, for each row, the first value among
# city1, city2 and city3 that is neither null nor an empty string.
# coalesce() only skips nulls, so empty strings are mapped to null first.
blank_as_null = [
    when(df[city] == '', None).otherwise(df[city])
    for city in ('city1', 'city2', 'city3')
]
df1 = df.withColumn('firstnotnull', coalesce(*blank_as_null))

df1.select('firstnotnull').show()
# The source DataFrame is unchanged; show it again for comparison.
df.show()

> **3. Calculate the percentage of marks for each student. Each subject is out of 100 marks. Create a result column based on the conditions below.**
>
![alt text](question_ss/03.jpg "question")

> Answer
>
![alt text](question_ss/03a.jpg "question")

In [0]:
# Students: (Id, Name)
schema1 = ['Id', 'Name']
data1 = [
    (1, "Steve"),
    (2, 'David'),
    (3, 'John'),
    (4, 'Shree'),
    (5, 'Helen'),
]

# Marks: one row per (student Id, Subject)
schema2 = ['Id', 'Subject', 'Mark']
data2 = [
    (1, 'SQL', 40), (1, 'PySpark', 100),
    (2, 'SQL', 70), (2, 'PySpark', 60),
    (3, 'SQL', 30), (3, 'PySpark', 20),
    (4, 'SQL', 50), (4, 'PySpark', 50),
    (5, 'SQL', 45), (5, 'PySpark', 45),
]

df1 = spark.createDataFrame(data1, schema1)
df2 = spark.createDataFrame(data2, schema2)

df1.show()
df2.show()


In [0]:
# Step 1: combine both tables or dataframes by using join
df_join = df1.join(df2, df1['Id']==df2['Id']).drop(df2['Id'])
df_join.show()

In [0]:
from pyspark.sql.functions import sum, col, count

In [0]:
# Step 2: percentage per student = total marks / number of subject rows,
# computed per (Id, Name) group.
total_marks = sum(col('Mark'))
subject_count = count('*')
df_per = (
    df_join
    .groupBy('Id', 'Name')
    .agg((total_marks / subject_count).alias('Percentage'))
)
df_per.show()

In [0]:
# Step 3: map each Percentage to a Result label.
# when() branches are evaluated in order and the first match wins, so each
# branch only needs its lower bound: reaching it already means every higher
# band failed to match.
pct = df_per['Percentage']
grade = (
    when(pct >= 70, 'Distinction')
    .when(pct >= 60, 'First Class')
    .when(pct >= 50, 'Second Class')
    .when(pct >= 40, 'Third Class')
    .when(pct < 40, 'Fail')
)
result = df_per.withColumn('Result', grade)
result.show()

> **4. Department-wise nth highest salary employees.**
>
![alt text](question_ss/04.jpg "question")

> Answer
>
![alt text](question_ss/04a.jpg "question")

In [0]:
# Employees: (EmpId, EmpName, Salary, DeptName)
schema1 = ["EmpId", "EmpName", "Salary", "DeptName"]
data1 = [
    (1, "A", 1000, "IT"),
    (2, "B", 1500, "IT"),
    (3, "C", 2500, "IT"),
    (4, "D", 3000, "HR"),
    (5, "E", 2000, "HR"),
    (6, "F", 1000, "HR"),
    (7, "G", 4000, "Sales"),
    (8, "H", 4000, "Sales"),
    (9, "I", 1000, "Sales"),
    (10, "J", 2000, "Sales"),
]
df = spark.createDataFrame(data1, schema1)
df.show()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [0]:
# Rank salaries within each department, highest first.
# dense_rank() gives equal salaries the same rank and leaves no gaps after ties.
salary_desc_by_dept = (
    Window
    .partitionBy(df['DeptName'])
    .orderBy(df['Salary'].desc())
)
df_rank = df.withColumn('rank', dense_rank().over(salary_desc_by_dept))

df_rank.show()

In [0]:
# Keep the employees whose salary is the n-th highest in their department.
# Because the rank comes from dense_rank(), employees tied on salary are all returned.
nth_highest = 1  # 1 = highest salary, 2 = second highest, ...

# Bracket access is used for the 'rank' column so the lookup can never be
# shadowed by a DataFrame attribute of the same name.
res = df_rank.filter(df_rank['rank'] == nth_highest)
res.show()

> **5.**
>
![alt text](question_ss/05.jpg "question")

> Answer
>
![alt text](question_ss/05a.jpg "question")

In [0]:
# Employee salary payments: one row per (employee, salary date).
# salarydt is a string in day-month-2-digit-year form.
schema1 = ["EmpId", "EmpName", "Mgrid", "deptid", "salarydt", "salary"]
data1 = [
    (100, "Raj", None, 1, "01-04-23", 50000),
    (200, "Joanne", 100, 1, "01-04-23", 4000),
    (200, "Joanne", 100, 1, "13-04-23", 4500),
    (200, "Joanne", 100, 1, "14-04-23", 4020),
]

df_salary = spark.createDataFrame(data1, schema1)
df_salary.show()

# Department lookup: (deptid, deptname)
schema2 = ["deptid", "deptname"]
data2 = [
    (1, "IT"),
    (2, "HR"),
]

df_dept = spark.createDataFrame(data2, schema2)
df_dept.show()

In [0]:
# Parse the salarydt string (pattern 'dd-MM-yy') into a proper date column, Newsaldt.
parsed_salary_date = to_date(col('salarydt'), 'dd-MM-yy')
df = df_salary.withColumn('Newsaldt', parsed_salary_date)
df.show()

In [0]:
# Attach the department name. Joining on the column name (['deptid']) keeps a
# single deptid column in the result instead of one from each side.
df_join = df.join(df_dept, ['deptid'])

# Manager lookup with exactly one row per (EmpId, EmpName).
# df_join holds one row per salary payment, so self-joining against it directly
# would repeat every employee row once for each salary row the manager has.
df_mgr = df_join.select(
    col('EmpId').alias('MgrEmpId'),
    col('EmpName').alias('ManagerName'),
).distinct()

# Left join so that employees without a manager (null Mgrid) are kept,
# with a null ManagerName.
df_joined = df_join.alias('tbl1').join(
    df_mgr.alias('tbl2'),
    col('tbl1.Mgrid') == col('tbl2.MgrEmpId'),
    'left'
).select(
    col('tbl1.deptname'),
    col('tbl2.ManagerName'),
    col('tbl1.EmpName').alias('EmpName'),
    col('tbl1.Newsaldt').alias('Newsaldt'),
    col('tbl1.salary').alias('salary'),
)

df_joined.show()

In [0]:
# Total salary per department / manager / employee for each year and month.
# Year and Month are both derived from Newsaldt; 'MMM' formats the month as
# its abbreviated name.
group_keys = [
    'deptname',
    'ManagerName',
    'EmpName',
    year('Newsaldt').alias('Year'),
    date_format('Newsaldt', 'MMM').alias('Month'),
]
res = df_joined.groupBy(*group_keys).sum('salary')
res.show()

> **6. How to check for a data skew issue and how to solve it.**
>
![alt text](question_ss/06.jpg "question")

> **7. Merge two DataFrames.**


In [0]:
# First DataFrame: students with a Marks column.
columns_1 = ["ID", "Student_Name", "Department_Name", "City", "Marks"]
simpleData = [
    (1, "Sagar", "CSE", "UP", 80),
    (2, "Shivam", "IT", "MP", 86),
    (3, "Muni", "Mech", "AP", 70),
]

# Second DataFrame: same leading columns, but no Marks column.
columns_2 = ["ID", "Student_Name", "Department_Name", "City"]
simpleData_2 = [
    (5, "Raj", "CSE", "HP"),
    (7, "Kunal", "Mech", "Rajasthan"),
]

df_1 = spark.createDataFrame(data=simpleData, schema=columns_1)
df_2 = spark.createDataFrame(data=simpleData_2, schema=columns_2)
df_1.show()
df_2.show()

In [0]:
# Stack the two DataFrames, matching columns by name rather than by position.
# df_2 has no Marks column; allowMissingColumns=True lets the union proceed
# even though the two column sets differ.
df = df_1.unionByName(
    df_2,
    allowMissingColumns=True,
)
df.show()