This is a rough script to show how I made the MoneyBoard dataset. My goal was to capture around 90% of the total prize money. Please see the dataset for more info.

In [None]:
%%bash

# Third-party packages imported by this notebook: asks (async HTTP session),
# trio (async runner / nursery) and gazpacho (HTML parsing via Soup).
pip install asks trio gazpacho  # cool gen2 libraries

In [None]:
from pathlib import Path
import re
import ast

import asks
import numpy as np
import pandas as pd
import trio
import gazpacho as gzp

In [None]:
# Look-behind prefixes for the text that immediately precedes a dollar amount.
# They are raw strings so regex escapes such as `\s` and `\$` reach the `re`
# module untouched instead of being parsed as (invalid) Python string escapes.
variations1 = [
    r'(?<=rize:\s\$)',
    r'(?<=rize\s\-\s\$)',
]
variations2 = [  # so many formats...
    r'(?<=lace:\s{2}\$)',
    r'(?<=lace\s\-\s\$)',
    r'(?<=lace\s\-\s\$\s)',
    r'(?<=lace\s\-\s{2}\$)',
    r'(?<=lace\s\-\s{3}\$)',
    r'(?<=u003e\$)',
    r'(?<=u003e\s\$)',
    r'(?<=u003e\s\-\s\$\s)',
    r'(?<=lace\*{2}\s\-\s{3}\$\s)',
    r'(?<=lace\*{2}\s\-\s{2}\$\s)',
]
# Each pattern is "(one of the look-behinds) followed by an amount". The amount
# is two or more digits with optional comma separators; the trailing group
# repeats so that values with several separators ("1,000,000") are captured
# whole instead of being cut off after the first comma group.
# `findall` yields tuples whose element 0 is the matched amount text.
pattern1 = re.compile(fr'(({"|".join(variations1)})[0-9]+(?:,?[0-9]+)+)')
pattern2 = re.compile(fr'(({"|".join(variations2)})[0-9]+(?:,?[0-9]+)+)')
pattern2  # compiled fr strings are the bomb


In [None]:

async def fetch_page(s, slug):
    """Download a competition page and return its body as text.

    `s` is a session object with an awaitable ``get(url)``; `slug` is the
    competition slug appended to ``https://www.kaggle.com/c/``. The response
    bytes are decoded as UTF-8, silently dropping undecodable bytes.
    """
    response = await s.get(f"https://www.kaggle.com/c/{slug}")
    raw_bytes = response.content
    return raw_bytes.decode("utf-8", errors="ignore")


async def find_text(s, slug, prize_list):
    """Scrape one competition page and record the prize amounts found on it.

    Parameters
    ----------
    s : session object handed to ``fetch_page`` (needs an awaitable ``get``).
    slug : competition slug identifying the page to fetch.
    prize_list : list mutated in place; a ``(slug, prizes)`` tuple is appended,
        where ``prizes`` is a list of ints (empty when nothing was found).

    The text of the last ``<script class="kaggle-component">`` element is
    searched with ``pattern1`` first and, if that finds nothing, ``pattern2``.
    """
    content = await fetch_page(s, slug)
    soup = gzp.Soup(content)
    found = soup.find('script', {'class': 'kaggle-component'})

    # gazpacho's default find mode returns None for no match, a single Soup for
    # one match and a list for several. Normalise to a list so a page with zero
    # or one matching script does not raise (an exception in any one task would
    # abort every other fetch running in the same nursery).
    if found is None:
        blobs = []
    elif isinstance(found, list):
        blobs = found
    else:
        blobs = [found]

    if not blobs:
        prize_list.append((slug, []))
        return

    blob = blobs[-1]
    prize_text = pattern1.findall(blob.text)
    if not prize_text:
        prize_text = pattern2.findall(blob.text)
    # Element 0 of each match tuple is the amount text, e.g. "25,000".
    prizes = [int(t[0].replace(",", "")) for t in prize_text]
    prize_list.append((slug, prizes))

    
async def get_prizes(comps):
    """Scrape prize amounts for every slug in ``comps.Slug``.

    One ``find_text`` task is started per slug inside a single trio nursery,
    all sharing one ``asks`` session created with ``connections=16``. Returns
    the list of ``(slug, prizes)`` tuples that the tasks appended, in whatever
    order they were appended.
    """
    session = asks.sessions.Session("https://www.kaggle.com", connections=16)
    prize_list = []
    async with trio.open_nursery() as nursery:
        for slug in comps.Slug:
            nursery.start_soon(find_text, session, slug, prize_list)
    return prize_list


# Only these columns are read from Meta Kaggle's Competitions.csv.
cols = ['Id', "Slug", "DeadlineDate", "RewardQuantity", "NumPrizes"]
comps = pd.read_csv(
    '../input/meta-kaggle/Competitions.csv',
    usecols=cols,
    parse_dates=['DeadlineDate'],
)
# Exclude the two flight2 competitions.
comps = comps[~comps.Slug.isin(['flight2-milestone', 'flight2-main'])]
# "New" competitions: deadline year 2017 or later and RewardQuantity >= 1000.
comps_new = comps[
    comps.DeadlineDate.dt.year.ge(2017) & comps.RewardQuantity.ge(1000.0)
].copy()

In [None]:
%%time

# Fast fetch and parse: run the async scraper over the newer competitions.
# The result is a list of (slug, prizes) tuples.
prize_list = trio.run(get_prizes, comps_new)
# Reports the number of competitions requested (len of the frame, not of prize_list).
print(f"{len(comps_new)} pages fetched.")


In [None]:
# Attach the scraped prize lists to the competition rows, largest reward first.
prize_df = pd.DataFrame(prize_list, columns=['Slug', 'TeamPrize'])
comps_new = (
    comps_new
    .merge(prize_df, on='Slug')
    .sort_values('RewardQuantity', ascending=False)
)

# Error checking: compare the scraped count and sum with the Meta Kaggle values.
comps_new = comps_new.assign(
    ScrapedCount=lambda df: df.TeamPrize.apply(len),
    CountMatch=lambda df: df.NumPrizes == df.ScrapedCount,
    ScrapedSum=lambda df: df.TeamPrize.apply(lambda prizes: sum(np.array(prizes))),
    SumMatch=lambda df: df.RewardQuantity == df.ScrapedSum,
)
both_match = comps_new.CountMatch & comps_new.SumMatch
print(f"{(comps_new.CountMatch & comps_new.SumMatch).sum()} of {len(comps_new)} look good")
comps_new

In [None]:
# Older competitions (deadline year before 2017, RewardQuantity >= 1000),
# largest reward first. QuantSum is the running total of RewardQuantity
# in that order.
comps_old = (
    comps[comps.DeadlineDate.dt.year.lt(2017) & comps.RewardQuantity.ge(1000.0)]
    .copy()
    .sort_values('RewardQuantity', ascending=False)
    .assign(QuantSum=lambda df: df.RewardQuantity.cumsum())
)
comps_old

In [None]:
%%time

# Scrape the older competitions' pages concurrently.
prize_list = trio.run(get_prizes, comps_old)
print(f"{len(comps_old)} pages fetched.")

# Attach the scraped prize lists, largest reward first.
prize_df = pd.DataFrame(prize_list, columns=['Slug', 'TeamPrize'])
comps_old = (
    comps_old
    .merge(prize_df, on='Slug')
    .sort_values('RewardQuantity', ascending=False)
)

# Cut here: keep the first 25 rows  # $3+M
# Error checking: scraped count/sum versus the Meta Kaggle values.
comps_old1 = comps_old.iloc[:25, :].assign(
    ScrapedCount=lambda df: df.TeamPrize.apply(len),
    ScrapedSum=lambda df: df.TeamPrize.apply(lambda prizes: sum(np.array(prizes))),
    CountMatch=lambda df: df.NumPrizes == df.ScrapedCount,
    SumMatch=lambda df: df.RewardQuantity == df.ScrapedSum,
)
print(f"{(comps_old1.CountMatch & comps_old1.SumMatch).sum()} of {len(comps_old1)} look good")

There were still a lot of mismatches for both the new and the old competitions. After digging into them I found that some were actually good but many were not, so I ended up exporting the frame and fixing them manually.

In [None]:
# Stack the new competitions on top of the first slice of old ones.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
comps_most = pd.concat([comps_new, comps_old1])
# Link to each competition page, handy when checking rows by hand.
comps_most['Url'] = "https://www.kaggle.com/c/" + comps_most.Slug
# Export for manual correction (left disabled):
# comps_most.to_csv('comps_most.csv', index=False)

In [None]:
# New file with mostly correct entries.
# TeamPrize is stored as the text of a Python literal, so it is parsed back
# with ast.literal_eval while reading.
comps_refined = pd.read_csv(
    '../input/meta-kaggle-moneyboard/feeder_files/ml_comps_refined.csv',
    converters={'TeamPrize': ast.literal_eval},
    parse_dates=['DeadlineDate'],
)
comps_refined.sort_values('DeadlineDate')


Offline analysis shows I'm still short of itemizing 90% of the total prize value, so I go back to the older competitions to get closer.

In [None]:
# Pick up at the previous cut (row 25 onward) and repeat the error checking:
# scraped count/sum versus the Meta Kaggle values.
comps_old2 = comps_old.iloc[25:, :].assign(
    ScrapedCount=lambda df: df.TeamPrize.apply(len),
    ScrapedSum=lambda df: df.TeamPrize.apply(lambda prizes: sum(np.array(prizes))),
    CountMatch=lambda df: df.NumPrizes == df.ScrapedCount,
    SumMatch=lambda df: df.RewardQuantity == df.ScrapedSum,
)
print(f"{(comps_old2.CountMatch & comps_old2.SumMatch).sum()} of {len(comps_old2)} look good")

In [None]:
# Take all good matches over $10k: rows where both checks pass and
# RewardQuantity >= 10000, with QuantSum recomputed as the running total
# over just those rows.
# NOTE(review): the final QuantSum >= 10000 filter looks like it cannot drop
# anything once every row already has RewardQuantity >= 10000 — confirm intent.
comps_old2_qual = (
    comps_old2
    .loc[lambda df: df.CountMatch & df.SumMatch & df.RewardQuantity.ge(10000)]
    .assign(QuantSum=lambda df: df.RewardQuantity.cumsum())
    .loc[lambda df: df.QuantSum >= 10000]
)
comps_old2_qual

In [None]:

# Keep only the columns selected here before adding the qualifying older
# competitions to the refined frame.
cols = ['Id', 'Slug', 'DeadlineDate', 'TeamPrize']
comps_old2_qual = comps_old2_qual[cols]
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# NOTE: re-running this cell appends the same rows again; re-run the cell that
# loads comps_refined first.
comps_refined = pd.concat([comps_refined, comps_old2_qual])
comps_refined

In [None]:
# One row per individual prize instead of one list per competition.
comps_tidy = comps_refined.explode('TeamPrize', ignore_index=True)
# Rows left without a prize value after the explode.
not_founds = comps_tidy[comps_tidy.TeamPrize.isna()]
display(not_founds)

# LB_rank is the 1-based position of each prize within its competition Id.
# It is computed before the prize-less rows are dropped.
comps_tidy = (
    comps_tidy
    .assign(LB_rank=lambda df: df.groupby('Id').cumcount() + 1)
    .drop(not_founds.index)
)
comps_tidy.head(8)

In [None]:
# Add people: load the team, membership and user tables from Meta Kaggle.
team_cols = ['Id', 'CompetitionId', 'PrivateLeaderboardRank']
teams = pd.read_csv('../input/meta-kaggle/Teams.csv', usecols=team_cols)
members = pd.read_csv(
    '../input/meta-kaggle/TeamMemberships.csv',
    usecols=['TeamId', 'UserId'],
)
users = pd.read_csv(
    '../input/meta-kaggle/Users.csv',
    usecols=['Id', 'UserName', 'DisplayName'],
)

# Left joins throughout, so a prize row survives even when no team, member or
# user matches. Step 1 pairs each prize with the team whose private
# leaderboard rank equals LB_rank in that competition; steps 2 and 3 attach
# the team's members and their user records.
comps_matched = (
    comps_tidy
    .merge(
        teams,
        how='left',
        left_on=['Id', 'LB_rank'],
        right_on=['CompetitionId', 'PrivateLeaderboardRank'],
        suffixes=('_comp', '_team'),
    )
    .merge(members, how='left', left_on='Id_team', right_on='TeamId')
    .merge(users, how='left', left_on='UserId', right_on='Id')
)


In [None]:
# Add analytics comps and data science for good, read from the feeder files.
analytics = pd.read_csv('../input/meta-kaggle-moneyboard/feeder_files/analytics_comps.csv',
    parse_dates=['DeadlineDate'])
ds4good = pd.read_csv('../input/meta-kaggle-moneyboard/feeder_files/ds4good.csv',
    parse_dates=['DeadlineDate'])

# pd.concat is used because DataFrame.append was removed in pandas 2.0.
added = pd.concat([analytics, ds4good])
# Look up each winner's user record by UserName; the users table's Id becomes UserId.
added = added.merge(users[['Id', 'UserName', 'DisplayName']], how='left',
    on='UserName').rename(columns={'Id': 'UserId'})
# Count each team's prize once, even when the team has several member rows.
print(f"Added {added.drop_duplicates('TeamId').TeamPrize.sum():,.0f} in prizes.")

In [None]:
# Combine the matched winners with the feeder-file rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# NOTE(review): fillna('deletedUser') fills missing values in EVERY column, not
# only the user-name columns. A row with no matched team would get
# TeamId == 'deletedUser' and be grouped with all other such rows below —
# confirm that only user columns are ever missing here.
comps_final = pd.concat(
    [comps_matched.fillna('deletedUser'), added],
    ignore_index=True,
)
# TeamSize is the number of rows sharing a TeamId; the team prize is split
# evenly and rounded to whole units.
comps_final['TeamSize'] = comps_final.groupby('TeamId')['UserId'].transform('size')
comps_final['UserPrize'] = comps_final.eval('TeamPrize/TeamSize').round(0).astype(int)

In [None]:
# Check coverage: share of the total reward money that has been itemized per user.
itemized = comps_final.UserPrize.sum()
# NOTE(review): the extra 400000 is not explained in this notebook — presumably
# prize money that is not in Competitions.csv; confirm.
total = 400_000 + comps.RewardQuantity.sum()
print(f"Itemized {itemized/total:.3f} of ${total:,.0f}")

In [None]:
# Clean up: select and order the output columns, make the rank an integer,
# sort, and write the final CSV.
cols = [
    'CompetitionId', 'Slug', 'DeadlineDate', 'PrivateLeaderboardRank',
    'TeamPrize', 'TeamId', 'TeamSize', 'UserPrize',
    'UserId', 'UserName', 'DisplayName',
]
sort_cols = ['DeadlineDate', 'CompetitionId', 'PrivateLeaderboardRank', 'UserId']
comps_final = (
    comps_final[cols]
    .assign(PrivateLeaderboardRank=lambda df: df.PrivateLeaderboardRank.astype(int))
    .sort_values(sort_cols)
)

comps_final.to_csv('competition_prizewinners.csv', index=False)
comps_final

In [None]:
# comps_final.groupby('UserId').agg(
#     Kaggler = ('DisplayName', 'first'),
#     Earnings = ('UserPrize', 'sum')
#     ).sort_values('Earnings', ascending=False)[:60]