In [None]:
!pip install duckdb

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
import datetime
import duckdb
import warnings

In [None]:
%config InlineBackend.figure_format = 'svg'
sns.set_context("notebook", font_scale=1)
plt.rcParams.update({
    'figure.figsize': (10, 4),
    'axes.facecolor': 'white',
    'figure.facecolor': 'white'
})
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
warnings.filterwarnings("ignore")

In [None]:
# load data

# Directory that holds the Meta Kaggle CSV exports.
meta_kaggle_dir = '/kaggle/input/meta-kaggle'


def _read_meta(table, date_columns=None):
    """Read `<meta_kaggle_dir>/<table>.csv`, parsing `date_columns` as dates when given."""
    csv_path = f'{meta_kaggle_dir}/{table}.csv'
    if date_columns is None:
        return pd.read_csv(csv_path)
    return pd.read_csv(csv_path, parse_dates=date_columns)


df_competitions = _read_meta('Competitions')
df_teams = _read_meta('Teams')
df_users = _read_meta('Users')
df_team_memberships = _read_meta('TeamMemberships', ['RequestDate'])
df_submissions = _read_meta('Submissions', ['SubmissionDate', 'ScoreDate'])
df_forums = _read_meta('Forums')
df_forum_topics = _read_meta('ForumTopics')
df_forum_messages = _read_meta('ForumMessages', ['PostDate'])
df_forum_message_votes = _read_meta('ForumMessageVotes')

In [None]:
# data preprocessing
#
# Narrow every loaded table down to the rows that belong to one competition
# (walking competition -> teams -> memberships/users/submissions and
# competition -> forum -> topics -> messages -> votes), then derive the
# private-score range of each medal class.

df_competitions = df_competitions.query('Slug == "bms-molecular-translation"')
if df_competitions.empty:
    # Fail with a readable message instead of the IndexError that iloc[0] would raise.
    raise ValueError('Competition "bms-molecular-translation" was not found in df_competitions')
competition_id = df_competitions.iloc[0].Id

# `x == x` is False only for NaN, so the second condition keeps teams that
# have a public leaderboard rank.
df_teams = df_teams.query('CompetitionId == @competition_id and PublicLeaderboardRank == PublicLeaderboardRank')
df_team_memberships = df_team_memberships.query('TeamId in @df_teams.Id')
df_users = df_users.query('Id in @df_team_memberships.UserId')
df_submissions = df_submissions.query('TeamId in @df_teams.Id')
df_forums = df_forums.query('Id in @df_competitions.ForumId')
df_forum_topics = df_forum_topics.query('ForumId in @df_forums.Id')
df_forum_messages = df_forum_messages.query('ForumTopicId in @df_forum_topics.Id')
df_forum_message_votes = df_forum_message_votes.query('ForumMessageId in @df_forum_messages.Id')

# One row per medal value (-1 stands in for a NULL medal) with the min/max
# private score of the teams' private-leaderboard submissions, ordered by medal.
df_medals = duckdb.query("""select coalesce(cast(medal as int), -1) as medal, min(PrivateScoreFullPrecision) min_medal_score, max(PrivateScoreFullPrecision) as max_medal_score
    from df_teams t
    join df_submissions s on t.PrivateLeaderboardSubmissionId = s.Id
    group by medal
    order by medal""").to_df()

# Colour each medal class by its value rather than by row position, so the
# assignment neither mislabels nor raises when a medal class is absent.
medal_colors = {-1: 'lightskyblue', 1: 'gold', 2: 'silver', 3: 'brown'}
df_medals['color'] = df_medals['medal'].map(medal_colors)

In [None]:
# Headline row counts of the filtered tables.
totals = (
    ('Submissions', df_submissions),
    ('Users', df_users),
    ('Teams', df_teams),
    ('Forum Topics', df_forum_topics),
    ('Forum Messages', df_forum_messages),
)
for what, frame in totals:
    print(f'Total {what}:', len(frame))

In [None]:
# Private leaderboard: one horizontal bar per result row, ordered by score
# and coloured by the medal range the score falls into.
leaderboard_sql = """select PrivateScoreFullPrecision as score, color
from df_teams t
join df_submissions s on t.PrivateLeaderboardSubmissionId = s.Id
join df_medals m on s.PrivateScoreFullPrecision between m.min_medal_score and m.max_medal_score
order by score
"""
df = duckdb.query(leaderboard_sql).to_df()

plt.figure(figsize=(6, 8))
ax = sns.barplot(x=df.score, y=df.index, orient='h', dodge=False, palette=df.color)

# Limits are passed larger-value-first so index 0 is drawn at the top.
# NOTE(review): 884 is hard-coded — presumably the number of ranked teams; confirm.
ax.set_ylim(884, -10)
ax.set_xlim(0, 10)

ax.xaxis.set_major_locator(mpl.ticker.IndexLocator(1, 0))
ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(50))
ax.xaxis.grid(color='lightgrey', linestyle='--', linewidth=1)
ax.set_axisbelow(True)  # keep the grid behind the bars

ax.set_xlabel('Levenshtein Distance')
ax.set_title('Private Leaderboard (All)')
ax.set_ylabel('Rank');

In [None]:
%config InlineBackend.figure_format = 'svg'
df = duckdb.query("""select PrivateScoreFullPrecision as score, color
from df_teams t
join df_submissions s on t.PrivateLeaderboardSubmissionId = s.Id
join df_medals m on s.PrivateScoreFullPrecision between m.min_medal_score and m.max_medal_score
order by score
limit 100
""").to_df()
plt.figure(figsize=(6, 8))
sns.barplot(x=df.score, y=df.index, orient='h', dodge=False, palette=df.color)
plt.ylim(100, -1)
# plt.xlim(0, 10)
plt.gca().xaxis.set_major_locator(mpl.ticker.IndexLocator(1, 0))
plt.gca().yaxis.set_major_locator(mpl.ticker.MultipleLocator(50));
plt.gca().xaxis.grid(color='lightgrey', linestyle='--', linewidth=1)
plt.gca().set_axisbelow(True)
plt.xlabel('Levenshtein Distance')
plt.title('Private Leaderboard (Top 100)')
plt.ylabel('Rank');

In [None]:
# Number of submissions for each distinct SubmissionDate value.
submissions_per_day_sql = """select SubmissionDate as date, count(*) as count 
from df_submissions 
group by SubmissionDate 
order by Date"""
df = duckdb.query(submissions_per_day_sql).to_df()

ax = sns.barplot(data=df, x='date', y='count')
# NOTE(review): the labels are attached before the locator is replaced; keep
# this order — the label-to-bar mapping presumably depends on it.
day_labels = df.date.dt.strftime('%b %d')
ax.set_xticklabels(labels=day_labels)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))

ax.set_title('Submissions per day')
ax.set_xlabel('Date')
ax.set_ylabel('Count');

In [None]:
# Number of distinct teams that submitted on each SubmissionDate value.
teams_per_day_sql = """select SubmissionDate as date, count(distinct TeamId) as count
from df_submissions 
group by SubmissionDate 
order by Date"""
df = duckdb.query(teams_per_day_sql).to_df()

ax = sns.barplot(data=df, x='date', y='count')
# NOTE(review): the labels are attached before the locator is replaced; keep
# this order — the label-to-bar mapping presumably depends on it.
day_labels = df.date.dt.strftime('%b %d')
ax.set_xticklabels(labels=day_labels)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))

ax.set_title('Teams submitted per day')
ax.set_xlabel('Date')
ax.set_ylabel('Count');

In [None]:
# Running minimum of the private score, ordered by submission date,
# de-duplicated to distinct (date, score) rows.
best_score_sql = """
with t1 as (select SubmissionDate as date, min(PrivateScoreFullPrecision) over(order by SubmissionDate) as score 
from df_submissions)
select distinct * from t1
order by date
"""
df = duckdb.query(best_score_sql).to_df()

ax = sns.lineplot(data=df, x='date', y='score')

# Dashed reference line at the upper score bound of df_medals rows 1, 2 and 3,
# each drawn in that row's colour.
for medal_row in (1, 2, 3):
    medal = df_medals.iloc[medal_row]
    ax.axhline(y=medal.max_medal_score, color=medal.color, linestyle='--', lw=1)

ax.set_ylim(0, 3)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%b %d'))
ax.set_xlabel('Date')
ax.set_ylabel('Score')
ax.set_title('Best score over time');

In [None]:
# Every submission's private score, tagged with the colour of the medal range
# the score falls into.
# NOTE(review): this is an inner join on BETWEEN, so a score that lies outside
# every medal range produces no row — confirm that dropping those is intended.
scores_sql = """
select SubmissionDate as date, PrivateScoreFullPrecision as score, color
from df_submissions  s
join df_medals m on s.PrivateScoreFullPrecision between m.min_medal_score and m.max_medal_score
"""
df = duckdb.query(scores_sql).to_df()

ax = sns.scatterplot(
    data=df,
    x='date',
    y='score',
    hue=df.color,
    palette=df_medals.color.tolist(),
    hue_order=['lightskyblue', 'gold', 'silver', 'brown'],
    s=5,
    alpha=0.5,
    edgecolor=None,
)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))
ax.set_xlim(datetime.date(2021, 3, 15), datetime.date(2021, 6, 7))
ax.set_ylim(0, 5)
ax.legend([], [], frameon=False)  # replace the hue legend with an empty, frameless one
ax.set_xlabel('Date')
ax.set_ylabel('Levenshtein Distance')
ax.set_title('Submission scores over time');

In [None]:
# All submission scores of the teams ranked 50 or better on the private
# leaderboard; each team is labelled "<rank>. <team name>".
team_scores_sql = """
select cast(PrivateLeaderboardRank as int) || '. ' || TeamName as team, PrivateScoreFullPrecision as score, color
from df_submissions s
join df_medals m on s.PrivateScoreFullPrecision between m.min_medal_score and m.max_medal_score
join df_teams t on s.TeamId = t.Id
where PrivateLeaderboardRank <= 50
order by PrivateLeaderboardRank
"""
df = duckdb.query(team_scores_sql).to_df()

plt.figure(figsize=(8, 10))
ax = sns.stripplot(
    data=df,
    x='score',
    y='team',
    hue=df.color,
    palette=df_medals.color.tolist(),
    hue_order=['lightskyblue', 'gold', 'silver', 'brown'],
    s=5,
    edgecolor=None,
    alpha=0.5,
)
ax.set_xlim(0, 5)
ax.tick_params(axis='both', which='major', labelsize=8)
ax.legend([], [], frameon=False)  # replace the hue legend with an empty, frameless one
ax.set_xlabel('Levenshtein distance')
ax.set_title('Score distribution by team');

In [None]:
# Pie chart of participants per performance tier, annotated with absolute counts.
df = duckdb.query("""
select PerformanceTier, count(*) as count from df_users
group by PerformanceTier
order by PerformanceTier
""").to_df()

# Display name and slice colour per tier code. Keying both by the code (instead
# of relying on row position) means a tier that is absent from the data cannot
# shift the colours of the remaining tiers.
tier_styles = {
    0: ('Novice', '#55CD97'),
    1: ('Contributor', '#21BEFF'),
    2: ('Expert', '#96508E'),
    3: ('Master', '#F76629'),
    4: ('Grandmaster', '#DEAD24'),
}
tier_codes = df['PerformanceTier'].tolist()
# Codes outside the table get a generic label and a neutral colour rather than
# a NaN index entry and a recycled colour.
tier_colors = [tier_styles.get(code, (None, 'lightgrey'))[1] for code in tier_codes]
df['PerformanceTier'] = [tier_styles.get(code, (f'Tier {code}', None))[0] for code in tier_codes]
df = df.set_index('PerformanceTier')

# autopct receives a percentage; convert it back to the absolute user count.
participant_total = df['count'].sum()
ax = df['count'].plot.pie(colors=tier_colors, autopct=lambda p: f'{p * participant_total / 100:.0f}')
ax.set_ylabel('')
ax.set_title('Participants by level');

In [None]:
# One row per team membership; is_in_team is true when the membership's TeamId
# has more than one membership row.
membership_sql = """
select UserId as user_id, count(*) over (partition by TeamId) > 1 as is_in_team from df_team_memberships
"""
df = duckdb.query(membership_sql).to_df()

team_status = df.is_in_team.map({True: 'In team', False: 'Solo'})
status_counts = team_status.value_counts()
# autopct receives a percentage; convert it back to a number of rows.
ax = status_counts.plot.pie(autopct=lambda p: f'{p*len(df) / 100 :.0f} users')
ax.set_title('Solo or in team')
ax.set_ylabel('');

In [None]:
# Memberships are numbered per team in RequestDate order; every membership
# after a team's first (rn > 1) is counted, summed per RequestDate.
merges_sql = """
with t1 as (select RequestDate as date, row_number() over (partition by TeamId order by RequestDate) as rn from df_team_memberships)
select date, sum(cast(rn > 1 as int)) as count from t1
group by date
order by date
"""
df = duckdb.query(merges_sql).to_df()

ax = sns.barplot(data=df, x='date', y='count')
# NOTE(review): the labels are attached before the locator is replaced; keep
# this order — the label-to-bar mapping presumably depends on it.
day_labels = df.date.dt.strftime('%b %d')
ax.set_xticklabels(labels=day_labels)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))

ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.set_title('Team merges per day');

In [None]:
# Forum messages counted per calendar day (PostDate truncated to the day).
messages_per_day_sql = """select date_trunc('day', PostDate) as date, count(*) as count 
from df_forum_messages
group by date 
order by date"""
df = duckdb.query(messages_per_day_sql).to_df()

ax = sns.barplot(data=df, x='date', y='count')
# NOTE(review): the labels are attached before the locator is replaced; keep
# this order — the label-to-bar mapping presumably depends on it.
day_labels = df.date.dt.strftime('%b %d')
ax.set_xticklabels(labels=day_labels)
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(interval=2))

ax.set_title('Forum messages per day')
ax.set_xlabel('Date')
ax.set_ylabel('Count');

In [None]:
# Per-user forum activity: messages posted (with gold/silver/bronze message
# medal counts) plus votes received and given. Only users present in df_users
# appear, because message_stats inner-joins messages to df_users; vote counts
# default to 0 through the left joins. Sorted by votes received, descending.
forum_stats_sql = """
with votes_received as (
select ToUserId as user_id, count(*) as count
from df_forum_message_votes
group by ToUserId
), votes_given as (
select FromUserId as user_id, count(*) as count
from df_forum_message_votes
group by FromUserId
),
message_stats as (
select u.Id as user_id, u.DisplayName, count(*) as count, 
sum(cast(medal = 1 as int) ) as gold_count,
sum(cast(medal = 2 as int) ) as silver_count,
sum(cast(medal = 3 as int) ) as bronze_count
from df_forum_messages m
join df_users u on m.PostUserId = u.Id
group by u.Id, u.DisplayName
)
select DisplayName as "User", cast(ms.count as int) as "Messages Posted", 
    cast(coalesce(vr.count, 0) as int) as "Votes Received", cast(coalesce(vg.count, 0) as int) as "Votes Given",
    cast(coalesce(ms.gold_count, 0) as int) as "Gold",
    cast(coalesce(ms.silver_count, 0) as int) as "Silver",
    cast(coalesce(ms.bronze_count, 0) as int) as "Bronze"
from message_stats ms
left join votes_received vr on vr.user_id = ms.user_id
left join votes_given vg on vg.user_id = ms.user_id
order by "Votes Received" desc
"""
df = duckdb.query(forum_stats_sql).to_df()

# Render the first 50 rows with in-cell bars and an enlarged caption.
caption_style = {
    'selector': 'caption',
    'props': [
        ('font-size', '20pt')
    ]
}
top_rows = df.head(50)
top_rows.style.bar().set_caption('Forum statistics').set_table_styles([caption_style])