Problem Statement

Write a query to find PersonID, Name, number of friend, sum of marks of person who have friend with total score greater than 100.

In [0]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("SQL to PySpark").getOrCreate()


# Define schema for friend table
friend_schema = StructType([
    StructField("pid", IntegerType(), True),
    StructField("fid", IntegerType(), True)
])

# Define schema for person table
person_schema = StructType([
    StructField("PersonID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Score", IntegerType(), True)
])

# Create friend DataFrame
friend_data = [
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 3),
    (3, 5),
    (4, 2),
    (4, 3),
    (4, 5)
]
friend_df = spark.createDataFrame(data=friend_data, schema=friend_schema)

# Create person DataFrame
person_data = [
    (1, "Alice", 88),
    (2, "Bob", 11),
    (3, "Devis", 27),
    (4, "Tara", 45),
    (5, "John", 63)
]
person_df = spark.createDataFrame(data=person_data, schema=person_schema)


In [0]:
person_df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("mode", "PERMISSIVE")
  .load("/FileStore/person.csv")
)
friend_df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("mode", "PERMISSIVE")
  .load("/FileStore/friend.csv")
)

In [0]:
friend_df.display()
person_df.display()

PersonID,FriendID
1,2
1,3
2,1
2,3
3,5
4,2
4,3
4,5


PersonID,Name,Email,Score
1,Alice,alice2018@hotmail.com,88
2,Bob,bob2018@hotmail.com,11
3,Davis,davis2018@hotmail.com,27
4,Tara,tara2018@hotmail.com,45
5,John,john2018@hotmail.com,63


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, count, sum
# Join person_df with friend_df on PersonID
joined_df = person_df.alias('p').join(friend_df.alias('f'), col('p.PersonID') == col('f.PersonID'))

# Join the result with person_df again to get friend's details
joined_df = joined_df.join(person_df.alias('p2'), col('f.FriendID') == col('p2.PersonID'))

# Select the required columns
cte_df = joined_df.select(
    col('p.PersonID'),
    col('p.Name'),
    col('f.FriendID'),
    col('p2.Name').alias('fname'),
    col('p2.Score').alias('fscore')
)

# Perform the group by and aggregate operations
result_df = cte_df.groupBy('PersonID', 'Name') \
    .agg(
        count('*').alias('no_of_friends'),
        sum('fscore').alias('fscores')
    ) \
    .filter(col('fscores') > 100)

# Show the result
result_df.display()


PersonID,Name,no_of_friends,fscores
4,Tara,3,101.0
2,Bob,2,115.0


In [0]:
person_df.createOrReplaceTempView("person")
friend_df.createOrReplaceTempView("friend")

In [0]:
%sql
with cte as(
select p.*,f.FriendID,p2.Name as fname,p2.score as fscore from person p
inner join 
friend f on f.personid = p.PersonID
inner join person p2 on f.FriendID = p2.PersonID
order by p.personid)
select personid,name,count(*) as no_of_friends,sum(fscore) as fscores from cte group by personid,name
having fscores>100

personid,name,no_of_friends,fscores
2,Bob,2,115.0
4,Tara,3,101.0
