<h2><center> Hungry Geese - leaderboard exploratory data analysis (EDA) and team tracking </center></h2>

<h2><center> <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/52/Flying_goose.svg/236px-Flying_goose.svg.png" alt="Geese img"></center></h2>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import requests
import json
import datetime
from kaggle_environments import list_episodes
import warnings
# Silence every warning so the notebook output stays compact.
# NOTE(review): this also hides deprecation warnings raised by pandas/seaborn.
warnings.filterwarnings('ignore')
# Let displayed DataFrames show up to 1500 rows before being truncated.
pd.set_option("display.max_rows", 1500)
# Display floats with a thousands separator and one decimal place.
pd.options.display.float_format = '{:,.1f}'.format

In [None]:
# Download the public leaderboard and flatten it into one row per team.
f = requests.get("https://www.kaggle.com/c/hungry-geese/leaderboard.json?includeBeforeUser=true&includeAfterUser=true")
# Stop on an HTTP error instead of trying to decode an error page as JSON.
f.raise_for_status()
jsn = f.json()
# Collect all rows first and build the frame once: DataFrame.append copied the
# whole frame on every call and no longer exists in pandas >= 2.0.
leaderboard = pd.DataFrame(
    [
        {
            "teamName": user["teamName"],
            "teamId": user["teamId"],
            "teamMembers": user["teamMembers"],
            "rank": user["rank"],
            "medal": user["medal"],
            "score": user["score"],
            "n_agents": user["entries"],
            # Only the first 19 characters (date and time to the second) are parsed.
            "lastSubmission": datetime.datetime.strptime(user["lastSubmission"][:19], "%Y-%m-%dT%H:%M:%S"),
            "best_submissionId": user["submissionId"],
        }
        for user in jsn["beforeUser"] + jsn["afterUser"]
    ],
    # Explicit columns keep the schema stable even if the leaderboard is empty.
    columns=["teamName", "teamId", "teamMembers", "rank", "medal", "score", "n_agents", "lastSubmission", "best_submissionId"],
)
del f, jsn
leaderboard = leaderboard.astype({'rank': int, 'n_agents': int, "score": float})
# Lowest score currently holding each medal; NaN when no team has that medal.
gold_min_score = leaderboard[leaderboard["medal"] == "gold"]["score"].min()
silver_min_score = leaderboard[leaderboard["medal"] == "silver"]["score"].min()
bronze_min_score = leaderboard[leaderboard["medal"] == "bronze"]["score"].min()

# Load the Meta Kaggle episode table, keep the rows of competition 25401
# (presumably Hungry Geese - confirm), and normalise the column names and types.
episodes = (
    pd.read_csv("../input/meta-kaggle/Episodes.csv")
    .loc[lambda df: df["CompetitionId"] == 25401]
    .reset_index(drop=True)
    .drop(columns=["Type", "CompetitionId"])
    .rename(columns={"Id": "EpisodeId", "CreateTime": "EpisodeStartTime", "EndTime": "EpisodeEndTime"})
    .assign(
        EpisodeStartTime=lambda df: pd.to_datetime(df["EpisodeStartTime"], format="%m/%d/%Y %H:%M:%S"),
        EpisodeEndTime=lambda df: pd.to_datetime(df["EpisodeEndTime"], format="%m/%d/%Y %H:%M:%S"),
    )
)
# One row per agent appearance, enriched with the timing of its episode.
# The default (inner) merge keeps only agents whose episode survived the filter above.
episode_agents = (
    pd.read_csv("../input/meta-kaggle/EpisodeAgents.csv")
    .drop(columns=["Id", "State"])
    .merge(episodes, on="EpisodeId")
    .drop_duplicates()
)

# Resolve every submission to its team and submission time through the
# Kaggle episodes API. One episode per submission is enough, so only the
# highest EpisodeId of each submission is requested, in batches of 1000.
episodes_to_consider = episode_agents.groupby(["SubmissionId"])["EpisodeId"].max().to_list()
mapping_rows = []
for i in range(0, len(episodes_to_consider), 1000):
    batch = episodes_to_consider[i:(i + 1000)]
    try:
        resp = list_episodes(batch)
        batch_rows = [
            {
                "teamId": submission["teamId"],
                "SubmissionId": submission["id"],
                "SubmissionDate": datetime.datetime.strptime(submission["dateSubmitted"][:19], "%Y-%m-%dT%H:%M:%S"),
            }
            for submission in resp["result"]["submissions"]
        ]
    except Exception as ex:
        # Best effort: report the failed batch and carry on with the others.
        print("Error:", ex)
        continue
    mapping_rows.extend(batch_rows)

# Build the frame once (DataFrame.append is gone in pandas >= 2.0) and drop
# exact repeats so the merge below does not multiply the episode rows.
agents_mapping = pd.DataFrame(mapping_rows, columns=["teamId", "SubmissionId", "SubmissionDate"]).drop_duplicates()
# Keep a datetime dtype even when nothing was fetched, so `.dt` still works downstream.
agents_mapping["SubmissionDate"] = pd.to_datetime(agents_mapping["SubmissionDate"])

episode_agents = pd.merge(episode_agents, agents_mapping, on="SubmissionId", how="left").drop_duplicates()
episode_agents = pd.merge(episode_agents, leaderboard.loc[:, ["teamId", "teamName"]], on="teamId", how="left").drop_duplicates()
del episodes, episodes_to_consider, agents_mapping, mapping_rows

# 1. General stats: score 

In [None]:
# Distribution of team scores with the current medal thresholds overlaid.
plt.figure(figsize=(25,8))
plt.hist(leaderboard["score"], color="mistyrose", bins=50)
# Label each threshold line so the legend identifies it; with no labelled
# artists the legend was only an empty box carrying the title.
plt.axvline(x=gold_min_score, color="gold", label="Gold threshold")
plt.axvline(x=silver_min_score, color="silver", label="Silver threshold")
plt.axvline(x=bronze_min_score, color="peru", label="Bronze threshold")
plt.xlabel("Team score")
plt.ylabel("Number of teams")
plt.legend(title="Team score (vertical lines are medal thresholds)", loc="upper center", title_fontsize=25)
plt.show()

In [None]:
# Summary statistics of the leaderboard score column, printed under a caption.
score_summary = leaderboard["score"].describe()
print("Descriptive statistics for team score:", score_summary, sep="\n")
del score_summary

In [None]:
# Daily summary of agent scores, keyed by the calendar date on which each
# episode started; std and count are left out of the chart.
episode_day = episode_agents["EpisodeStartTime"].dt.date
day_stat = (
    episode_agents.groupby(episode_day)["UpdatedScore"]
    .describe()
    .drop(columns=["std", "count"])
    .rename_axis("date")
)
day_stat.plot(
    figsize=(25,10),
    title="Score summary statistics for individual agents (data for the last day is incomplete)",
)
plt.show()
del day_stat, episode_day

# 2. General stats: participation

In [None]:
# Distribution of the number of agents submitted per team.
plt.figure(figsize=(25,7))
# Roughly one bin per two distinct values, but never fewer than one bin:
# a bin count of 0 makes the histogram call fail.
plt.hist(leaderboard["n_agents"], color="beige", bins=max(1, len(leaderboard["n_agents"].unique()) // 2))
plt.xlabel("Team agents")
plt.ylabel("Number of teams")
# The legend is used only as an in-plot caption (there are no labelled artists).
plt.legend(title="Number of submissions (agents)", loc="upper center", title_fontsize=25)
plt.show()

In [None]:
# Summary statistics of the per-team agent count, printed under a caption.
agents_summary = leaderboard["n_agents"].describe()
print("Descriptive statistics for number of submissions (agents):", agents_summary, sep="\n")
del agents_summary

In [None]:
# Distribution of each team's most recent submission time.
plt.figure(figsize=(25,8))
# Roughly one bin per three distinct timestamps, but never fewer than one bin:
# a bin count of 0 makes the histogram call fail.
plt.hist(leaderboard["lastSubmission"], color="lightblue", bins=max(1, len(leaderboard["lastSubmission"].unique()) // 3))
plt.xlabel("Last submission time")
plt.ylabel("Number of teams")
# The legend is used only as an in-plot caption (there are no labelled artists).
plt.legend(title="Last submission date by team", loc="upper center", title_fontsize=25)
plt.show()

In [None]:
# Take the first row of each submission to read its submission date,
# then count how many submissions fall on each calendar day.
first_rows = episode_agents.groupby(["SubmissionId"]).head(1)
submissions_per_day = first_rows["SubmissionDate"].dt.date.value_counts()
submissions_per_day.plot(
    figsize=(25,7),
    title="Number of new submissions by date (data for the last day is incomplete)",
    color="brown",
)
plt.show()
del first_rows, submissions_per_day

In [None]:
# A team's earliest submission date marks the day it joined;
# count how many teams joined on each calendar day.
first_submission = episode_agents.groupby("teamId")["SubmissionDate"].min()
teams_per_day = first_submission.dt.date.value_counts()
teams_per_day.plot(
    figsize=(25,7),
    title="Number of new teams by date (data for the last day is incomplete)",
    color="darkorange",
)
plt.show()
del first_submission, teams_per_day

# 3. Number of submissions vs team score

In [None]:
# Correlation between the number of agents a team submitted and its score:
# the off-diagonal entry of the 2x2 correlation matrix.
agents_score_corr = np.corrcoef(leaderboard["n_agents"], leaderboard["score"])
print("Correlation coef.:", agents_score_corr[0, 1])
del agents_score_corr

In [None]:
# Scatter plot with a fitted regression line: agents submitted vs. team score.
sns.set(style="white")
# Pass the variables by keyword: positional x/y are rejected by seaborn >= 0.12
# (and only warned about in 0.11); keywords work on every version.
sns.lmplot(x="n_agents", y="score", data=leaderboard, scatter_kws={"alpha": 0.5}, line_kws={"color": "green"}, height=10)
# The legend is used only as an in-plot caption (there are no labelled artists).
plt.legend(title="Number of agents vs team score", loc="lower center", title_fontsize = 25)
plt.show()

In [None]:
# Same regression plot restricted to the teams ranked 100 or better.
sns.set(style="white")
# Pass the variables by keyword: positional x/y are rejected by seaborn >= 0.12
# (and only warned about in 0.11); keywords work on every version.
sns.lmplot(x="n_agents", y="score", data=leaderboard[leaderboard["rank"] <= 100],
           scatter_kws={"alpha": 0.5}, line_kws={"color": "green"}, height=8)
# The legend is used only as an in-plot caption (there are no labelled artists).
plt.legend(title="Number of agents vs team score (medal-winning teams)", loc="lower center", title_fontsize = 25)
plt.show()

# 4. Geese index stats

[@chankhavu](https://www.kaggle.com/chankhavu) suggests that there is an [unfair advantage](https://www.kaggle.com/c/hungry-geese/discussion/228967) for geese with a smaller index. Nevertheless, this advantage seems to have little (if any) effect on the outcome (game reward statistics):

In [None]:
# Reward summary for each goose index; the sample count is left out and the
# index is turned back into a regular column for display.
(
    episode_agents.groupby(["Index"])["Reward"]
    .describe()
    .drop(columns=["count"])
    .reset_index()
)

In [None]:
# Box plot of rewards, one box per goose index, drawn on a single set of axes.
box_colors = {"medians": "DarkBlue", "boxes": "DarkGreen"}
rewards_by_index = episode_agents.groupby("Index")
rewards_by_index.boxplot(
    column="Reward",
    subplots=False,
    figsize=(25,7),
    grid=False,
    color=box_colors,
)
plt.show()
del box_colors, rewards_by_index

# 5. Score distribution by team

*NB: team members are clickable*

In [None]:
from html import escape


def _member_badges(members):
    """Render a team's members as linked avatar thumbnails (HTML snippet).

    The display name and URLs come from the remote leaderboard response, so
    they are HTML-escaped before being placed inside attribute values.
    """
    return "".join(
        '<div style="float: right; margin: -4px 2px;">'
        f'<a href="https://www.kaggle.com{escape(str(member["profileUrl"]))}">'
        f'<img src="{escape(str(member["thumbnailUrl"]))}" width="24" height="24" '
        f'alt="{escape(str(member["displayName"]))}"></a></div>'
        for member in members
    )


score_columns = ["mean score", "min score", "Q1 score", "median score", "Q3 score", "max score"]
medal_columns = ["gold medal agents", "silver medal agents", "bronze medal agents"]

# Latest row of every submission: sorted newest-first, first row per submission kept.
last_submissions = episode_agents.sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False).drop_duplicates(["SubmissionId"])

# Per-team summary of the latest score of each of its agents.
team_scores = (
    last_submissions.groupby(["teamId"])["UpdatedScore"]
    .describe()
    .drop(columns=["count", "std"])
    .rename(columns={"mean": "mean score", "min": "min score", "25%": "Q1 score",
                     "50%": "median score", "75%": "Q3 score", "max": "max score"})
    .reset_index(level=0)
)
team_rating = pd.merge(
    leaderboard.loc[:, ["teamId", "teamName", "teamMembers", "n_agents"]],
    team_scores,
    on="teamId")

# Count, per team, the agents at or above each medal threshold. The counts are
# cumulative: an agent above the gold threshold is also counted for silver and bronze.
for medal_column, threshold in zip(medal_columns, (gold_min_score, silver_min_score, bronze_min_score)):
    medal_counts = (
        last_submissions[last_submissions["UpdatedScore"] >= threshold]
        .groupby("teamId")["EpisodeId"]
        .count()
        .reset_index(level=0)
        .rename(columns={"EpisodeId": medal_column})
    )
    team_rating = pd.merge(team_rating, medal_counts, on="teamId", how="left")
# Teams with no agent above a threshold get 0 instead of NaN.
team_rating = team_rating.fillna({medal_column: 0 for medal_column in medal_columns})
del last_submissions, team_scores, medal_counts
team_rating = team_rating.drop(columns=["teamId"])

formatters = {"teamMembers": _member_badges}
formatters.update({column: '{:,.1f}'.format for column in score_columns})
formatters.update({column: '{:,.0f}'.format for column in medal_columns})
styled_rating = (
    team_rating.sort_values("max score", ascending=False)
    .style.format(formatters)
    .background_gradient(subset=score_columns, cmap="Wistia")
    .background_gradient(subset=medal_columns, cmap="YlGn")
)
# Styler.hide_index() was removed in pandas 2.0; Styler.hide() replaces it from 1.4 on.
styled_rating.hide(axis="index") if hasattr(styled_rating, "hide") else styled_rating.hide_index()

# 6. Agents grouped by the day of submission

In [None]:
# Latest row of every submission: sorted newest-first, first row per submission kept.
latest_rows = episode_agents.sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False).drop_duplicates(["SubmissionId"])
# Average of those latest scores, grouped by the calendar day of submission.
score_by_day_final = (
    latest_rows.groupby([latest_rows["SubmissionDate"].dt.date])["UpdatedScore"]
    .mean()
    .reset_index()
    .rename(columns={"UpdatedScore": "Recent score (average by date of submission)"})
    .dropna()
)
plt.figure(figsize=(25, 7))
mean_axes = sns.lineplot(
    data=score_by_day_final,
    x="SubmissionDate",
    y="Recent score (average by date of submission)",
    color="rosybrown",
    legend=False,
)
mean_axes.set_title("Average final score by date of submission")
plt.show()
del score_by_day_final, latest_rows, mean_axes

In [None]:
# Mean agent score for every (submission day, episode day) pair, so each
# submission-day cohort can be followed over time as its own line.
submission_day = episode_agents["SubmissionDate"].dt.date
episode_day = episode_agents["EpisodeStartTime"].dt.date
score_by_day = (
    episode_agents.groupby([submission_day, episode_day])["UpdatedScore"]
    .mean()
    .reset_index()
    .rename(columns={"EpisodeStartTime": "EpisodeDate", "UpdatedScore": "Average score"})
)
plt.figure(figsize=(25, 10))
cohort_axes = sns.lineplot(data=score_by_day, x="EpisodeDate", y="Average score", hue="SubmissionDate", legend=False)
cohort_axes.set_title("Dynamics of average agent score by date of submission")
plt.show()
del score_by_day, submission_day, episode_day, cohort_axes

In [None]:
# Latest row of every submission: sorted newest-first, first row per submission kept.
latest_rows = episode_agents.sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False).drop_duplicates(["SubmissionId"])
# Best of those latest scores, grouped by the calendar day of submission.
score_by_day_final = (
    latest_rows.groupby([latest_rows["SubmissionDate"].dt.date])["UpdatedScore"]
    .max()
    .reset_index()
    .rename(columns={"UpdatedScore": "Recent score (max by date of submission)"})
    .dropna()
)
plt.figure(figsize=(25, 7))
max_axes = sns.lineplot(
    data=score_by_day_final,
    x="SubmissionDate",
    y="Recent score (max by date of submission)",
    color="brown",
    legend=False,
)
max_axes.set_title("Maximum final agent score by date of submission - max")
plt.show()
del score_by_day_final, latest_rows, max_axes

In [None]:
# Maximum agent score for every (submission day, episode day) pair, so each
# submission-day cohort can be followed over time as its own line.
submission_day = episode_agents["SubmissionDate"].dt.date
episode_day = episode_agents["EpisodeStartTime"].dt.date
score_by_day = (
    episode_agents.groupby([submission_day, episode_day])["UpdatedScore"]
    .max()
    .reset_index()
    .rename(columns={"EpisodeStartTime": "EpisodeDate", "UpdatedScore": "Max score"})
)
plt.figure(figsize=(25, 10))
cohort_axes = sns.lineplot(data=score_by_day, x="EpisodeDate", y="Max score", hue="SubmissionDate", legend=False)
cohort_axes.set_title("Dynamics of max agent score by date of submission")
plt.show()
del score_by_day, submission_day, episode_day, cohort_axes

# 7. Top-100 agents

In [None]:
# Keep the latest row of every submission, then take the 100 highest scores.
top100_agents = (
    episode_agents.sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False)
    .drop_duplicates(["SubmissionId"])
    .sort_values(by=["UpdatedScore"], ascending=False)
    .head(100)
    .reset_index(drop=True)
    .rename(columns={"UpdatedScore": "Score"})
)
# Display team, submission and score, shading the score column.
top100_agents[["teamName", "SubmissionId", "Score"]].style.background_gradient(subset=["Score"], cmap="Wistia")

In [None]:
# Horizontal bar chart: how many of the top-100 agents belong to each team,
# with teams ordered by that count.
plt.figure(figsize=(20, 20))
agents_per_team = top100_agents['teamName'].value_counts()
ax = sns.countplot(y=top100_agents['teamName'], order=agents_per_team.index, palette="Set3")
# Write each count just past the end of its bar; bars and counts are paired in order.
for bar, agent_count in zip(ax.patches, agents_per_team):
    ax.text(bar.get_width() + 0.1, bar.get_y() + 0.5, agent_count, ha="center")
# The legend is used only as an in-plot caption (there are no labelled artists).
plt.legend(title="TOP-100 agents by team", loc="center right", title_fontsize = 15)
plt.show()

# 8. Team score tracking

In [None]:
# Daily score summary per team, keyed by team name and episode start date.
team_day_stat = (
    episode_agents.groupby([episode_agents["teamName"], episode_agents["EpisodeStartTime"].dt.date])["UpdatedScore"]
    .describe()
    .reset_index()
    .drop(columns=["std", "count"])
    .rename(columns={"EpisodeStartTime": "EpisodeDate", "mean": "mean score", "min": "min score",
                     "25%": "Q1 score", "50%": "median score", "75%": "Q3 score", "max": "max score"})
)
# Drop the most recent day: the data for the last day is incomplete.
team_day_stat = team_day_stat[team_day_stat["EpisodeDate"] != team_day_stat["EpisodeDate"].max()]

# Teams that have episode data, in leaderboard order; the best-ranked one is shown first.
teams = leaderboard[leaderboard["teamName"].isin(episode_agents["teamName"])].sort_values("rank")["teamName"].to_list()
default_team = teams[0]

layout = go.Layout(
    title='Team score tracking',
    width=1400,
    height=800,
    xaxis=dict(title='Date'),
    yaxis=dict(title='Score statistics'),
)
fig = go.Figure(layout=layout)

# One trace per (team, statistic); team_plt_names records the owner of each
# trace so the dropdown can toggle visibility team by team.
stat_columns = ("mean score", "median score", "min score", "max score", "Q1 score", "Q3 score")
team_plt_names = []
for team in teams:
    team_rows = team_day_stat[team_day_stat["teamName"] == team]
    is_default = team == default_team
    for stat in stat_columns:
        fig.add_trace(go.Scatter(x=team_rows["EpisodeDate"],
                                 y=team_rows[stat],
                                 mode='lines',
                                 visible=is_default,
                                 name=stat))
        team_plt_names.append(team)

# One dropdown entry per team, showing only that team's traces.
buttons = [
    dict(method='update',
         label=team,
         args=[{'visible': [team == owner for owner in team_plt_names]}])
    for team in teams
]
dropdown = {"buttons": buttons,
            "direction": "down",
            "active": teams.index(default_team),
            "showactive": True, "x": 0.5, "y": 1.1}
fig.update_layout(showlegend=False, updatemenus=[dropdown], template="simple_white")
fig.show()
del team_day_stat

In [None]:
# Latest score of every submission (newest row kept), without rows lacking a score.
last_agent_score = (
    episode_agents.sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False)
    .drop_duplicates(["SubmissionId"])
    .loc[:, ["teamName", "SubmissionId", "UpdatedScore"]]
    .reset_index(drop=True)
    .rename(columns={"UpdatedScore": "Recent Score"})
    .dropna(subset=["Recent Score"])
)
# Teams that have episode data, in leaderboard order; the best-ranked one is shown first.
teams = leaderboard[leaderboard["teamName"].isin(episode_agents["teamName"])].sort_values("rank")["teamName"].to_list()
default_team = teams[0]

layout = go.Layout(
    title="Team agents' score distribution",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    width=1400,
    height=800,
    xaxis=dict(title='Last (final) score', linecolor="black"),
    yaxis=dict(title='Number of agents', linecolor="black"),
)
fig = go.Figure(layout=layout)

# One histogram per team; team_plt_names records the owner of each trace so
# the dropdown can toggle visibility team by team.
team_plt_names = []
for team in teams:
    team_scores = last_agent_score.loc[last_agent_score["teamName"] == team, "Recent Score"].reset_index(drop=True)
    fig.add_trace(go.Histogram(x=team_scores,
                               visible=(team == default_team),
                               name=team,
                               opacity=0.75,
                               histnorm="",  # empty string: plain counts rather than a normalised histogram
                               marker_color='darksalmon'))
    team_plt_names.append(team)

# One dropdown entry per team, showing only that team's histogram.
buttons = [
    dict(method='update',
         label=team,
         args=[{'visible': [team == owner for owner in team_plt_names]}])
    for team in teams
]
dropdown = {"buttons": buttons,
            "direction": "down",
            "active": teams.index(default_team),
            "showactive": True, "x": 0.5, "y": 1.1}
fig.update_layout(showlegend=False, updatemenus=[dropdown])
# Medal thresholds as vertical reference lines.
fig.add_vline(x=gold_min_score, line_color="gold")
fig.add_vline(x=silver_min_score, line_color="silver")
fig.add_vline(x=bronze_min_score, line_color="peru")
fig.show()
del last_agent_score

In [None]:
# Teams that have episode data, in leaderboard order; the best-ranked one is shown first.
teams = leaderboard[leaderboard["teamName"].isin(episode_agents["teamName"])].sort_values("rank")["teamName"].to_list()
default_team = teams[0]

layout = go.Layout(
    title="Team agents' tracking",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    width=1400,
    height=800,
    xaxis=dict(title="Episodes' date and time", linecolor="black"),
    yaxis=dict(title='Scores', linecolor="black"),
)
fig = go.Figure(layout=layout)

# One trace per submission showing its score after each episode;
# team_plt_names records the owning team of each trace for the dropdown.
team_plt_names = []
for team in teams:
    team_rows = (
        episode_agents[episode_agents["teamName"] == team]
        .dropna(subset=["EpisodeStartTime", "UpdatedScore"])
        .reset_index(drop=True)
        .sort_values(["SubmissionId", "EpisodeStartTime"], ascending=False)
    )
    is_default = team == default_team
    for submission in team_rows["SubmissionId"].unique().tolist():
        submission_rows = team_rows.loc[team_rows["SubmissionId"] == submission]
        fig.add_trace(go.Scatter(x=submission_rows["EpisodeStartTime"],
                                 y=submission_rows["UpdatedScore"],
                                 visible=is_default,
                                 name=str(submission)))
        team_plt_names.append(team)

# One dropdown entry per team, showing only that team's submissions.
buttons = [
    dict(method='update',
         label=team,
         args=[{'visible': [team == owner for owner in team_plt_names]}])
    for team in teams
]
dropdown = {"buttons": buttons,
            "direction": "down",
            "active": teams.index(default_team),
            "showactive": True, "x": 0.5, "y": 1.1}
fig.update_layout(showlegend=False, updatemenus=[dropdown])
# Medal thresholds as horizontal reference lines.
fig.add_hline(y=gold_min_score, line_color="gold")
fig.add_hline(y=silver_min_score, line_color="silver")
fig.add_hline(y=bronze_min_score, line_color="peru")
fig.show()