# Problem Statement
The winner in each group is the player who scored the maximum total points within the group. In the case of a tie, the lowest player_id wins.

Write a solution to find the winner in each group.

Return the result table in any order.

In [0]:
from pyspark.sql import Row, SparkSession

# Start (or reuse) the Spark session used by this notebook
spark = SparkSession.builder.appName("PlayersMatches").getOrCreate()

# Players table: one row per player together with the group the player is in
Player = Row("player_id", "group_id")
players_data = [
    Player(15, 1),
    Player(25, 1),
    Player(30, 1),
    Player(45, 1),
    Player(10, 2),
    Player(35, 2),
    Player(50, 2),
    Player(20, 3),
    Player(40, 3),
]
players_df = spark.createDataFrame(players_data)

# Matches table: one row per match with both participants and their scores
Match = Row("match_id", "first_player", "second_player", "first_score", "second_score")
matches_data = [
    Match(1, 15, 45, 3, 0),
    Match(2, 30, 25, 1, 2),
    Match(3, 30, 15, 2, 0),
    Match(4, 40, 20, 5, 2),
    Match(5, 35, 50, 1, 1),
]
matches_df = spark.createDataFrame(matches_data)

# Render both input tables
players_df.display()
matches_df.display()

player_id,group_id
15,1
25,1
30,1
45,1
10,2
35,2
50,2
20,3
40,3


match_id,first_player,second_player,first_score,second_score
1,15,45,3,0
2,30,25,1,2
3,30,15,2,0
4,40,20,5,2
5,35,50,1,1


### PySpark

In [0]:
from pyspark.sql.functions import col, sum as _sum, rank
from pyspark.sql.window import Window
# Step 1: Total the points each player earned on each side of a match.
# A player may be first_player in one match and second_player in another,
# so the two sides are aggregated separately and stacked afterwards.
first_player_scores = (
    matches_df.groupBy("first_player")
    .agg(_sum("first_score").alias("score"))
    .withColumnRenamed("first_player", "player_id")
)
second_player_scores = (
    matches_df.groupBy("second_player")
    .agg(_sum("second_score").alias("score"))
    .withColumnRenamed("second_player", "player_id")
)

# Union the two results (a player can still appear twice here, once per side)
player_score_df = first_player_scores.unionByName(second_player_scores)

# Step 2: Create final_score DataFrame
# Drive the join from players_df with a LEFT join so that players who never
# appear in matches_df are kept; otherwise a group whose players have no
# recorded matches would silently get no winner. Their summed score is null
# after the join, so it is replaced with 0 points.
final_score_df = (
    players_df.join(player_score_df, "player_id", "left")
    .groupBy("group_id", "player_id")
    .agg(_sum("score").alias("score"))
    .fillna(0, subset=["score"])
)

# Step 3: Create final_ranking DataFrame
# Highest score first; ties are broken by the lowest player_id. Because
# player_id is unique within final_score_df, exactly one row per group gets rn = 1.
window_spec = Window.partitionBy("group_id").orderBy(
    col("score").desc(), col("player_id").asc()
)
final_ranking_df = final_score_df.withColumn("rn", rank().over(window_spec))

# Step 4: Select top-ranked players from each group
top_ranked_players_df = final_ranking_df.filter(col("rn") == 1)
top_ranked_players_df.display()

group_id,player_id,score,rn
1,15,3,1
2,35,1,1
3,40,5,1


### Spark SQL

In [0]:
players_df.createOrReplaceTempView("players")
matches_df.createOrReplaceTempView("matches")

In [0]:
%sql
-- Points each player earned, aggregated separately for each side of a match
-- (a player can be first_player in one match and second_player in another).
with player_score as (
    select first_player as player_id, sum(first_score) as score
    from matches
    group by first_player
    union all
    select second_player as player_id, sum(second_score) as score
    from matches
    group by second_player
)
-- Total points per player. The join is driven from players with a LEFT join so
-- players (and therefore groups) with no recorded matches are kept; they count
-- as 0 points instead of disappearing from the result.
, final_score as (
    select pl.group_id, pl.player_id, coalesce(sum(ps.score), 0) as score
    from players pl
    left join player_score ps
        on ps.player_id = pl.player_id
    group by pl.group_id, pl.player_id
)
-- Highest score first; ties are broken by the lowest player_id.
, final_ranking as (
    select *, rank() over (partition by group_id order by score desc, player_id asc) as rn
    from final_score
)
select * from final_ranking where rn = 1
;

group_id,player_id,score,rn
1,15,3,1
2,35,1,1
3,40,5,1
