In [None]:
import shutil
import os
import urllib.request

# Get url from https://www.worldcubeassociation.org/export/results
# A pasted URL often carries a trailing space or newline, which makes the
# download fail, so surrounding whitespace is stripped before use.
url = input('WCA export URL: ').strip()
if not url:
    raise ValueError('No export URL provided')

# If url contains .sql, replace with .tsv
url = url.replace('.sql', '.tsv')

print("Download")
urllib.request.urlretrieve(url, "WCA_export.zip")

# Start from an empty directory so files of a previous export cannot linger
if os.path.exists('WCA_export'):
    shutil.rmtree('WCA_export')

print("Unzip")
shutil.unpack_archive('WCA_export.zip', 'WCA_export')

print("Rename")
for filename in os.listdir('WCA_export'):
    # Remove WCA_export_ from the filename
    new_name = filename.replace('WCA_export_', '')
    os.rename(f'WCA_export/{filename}', f'WCA_export/{new_name}')

print("Remove unnecessary")

def remove_if_exists(path):
    """Delete the file at ``path``; do nothing when the path does not exist."""
    if not os.path.exists(path):
        return
    os.remove(path)

# Tables that are not needed downstream. The last three entries are the
# legacy (CamelCase) export names of the same tables.
for unneeded_path in (
    './WCA_export/championships.tsv',
    './WCA_export/eligible_country_iso2s_for_championship.tsv',
    './WCA_export/formats.tsv',
    './WCA_export/round_types.tsv',
    './WCA_export/scrambles.tsv',
    './WCA_export/Formats.tsv',
    './WCA_export/RoundTypes.tsv',
    './WCA_export/Scrambles.tsv',
):
    remove_if_exists(unneeded_path)

In [None]:
import pandas as pd
import os
import shutil

export_dir = 'WCA_export'
export_staff_path = os.path.join(export_dir, 'staff.tsv')

# A staff file kept next to the notebook (either spelling) is copied into the
# export directory so it is loaded like the other tables. Only the first
# candidate that exists is used.
local_staff_candidates = ['staff.tsv', 'Staff.tsv']
for candidate in local_staff_candidates:
    if not os.path.exists(candidate):
        continue
    shutil.copy(candidate, export_staff_path)
    break

# Every .tsv file in the export directory becomes one table
filenames = [entry for entry in os.listdir(export_dir) if entry.endswith('.tsv')]

def normalize_tablename(name):
    """Return the lower-case table name, mapping legacy names to snake_case.

    The legacy names ``RanksSingle``, ``RanksAverage`` and ``RoundTypes``
    (matched case-insensitively) become ``ranks_single``, ``ranks_average``
    and ``round_types``. Any other name is only lower-cased.
    """
    lowered = name.lower()
    if lowered == 'rankssingle':
        return 'ranks_single'
    if lowered == 'ranksaverage':
        return 'ranks_average'
    if lowered == 'roundtypes':
        return 'round_types'
    return lowered

# Maps normalized table name -> DataFrame; filled by the load loop below and
# mutated by the later cells.
dfs = {}

def normalize_columns(tablename, df):
    """Rename snake_case export columns to the camelCase names used later on.

    Args:
        tablename: Normalized table name (e.g. ``'results'``).
        df: DataFrame loaded for that table.

    Returns:
        A renamed copy for the rank, results, persons and countries tables;
        ``df`` itself, untouched, for every other table. Columns that are not
        present are simply skipped by ``rename``.
    """
    ranks_renames = {
        'event_id': 'eventId',
        'person_id': 'personId',
        'country_id': 'countryId',
        'continent_id': 'continentId',
        'world_rank': 'worldRank',
        'continent_rank': 'continentRank',
        'country_rank': 'countryRank',
    }
    renames_by_table = {
        'ranks_single': ranks_renames,
        'ranks_average': ranks_renames,
        'results': {
            'competition_id': 'competitionId',
            'round_type_id': 'roundTypeId',
            'event_id': 'eventId',
            'person_id': 'personId',
            'person_country_id': 'personCountryId',
            'person_name': 'personName',
            'format_id': 'formatId',
            'regional_single_record': 'regionalSingleRecord',
            'regional_average_record': 'regionalAverageRecord',
        },
        'persons': {
            'country_id': 'countryId',
            'continent_id': 'continentId',
        },
        'countries': {'continent_id': 'continentId'},
    }

    column_renames = renames_by_table.get(tablename)
    if column_renames is None:
        return df
    return df.rename(columns=column_renames)

# Load every .tsv file into dfs, keyed by its normalized table name
for filename in filenames:
    # Text before the first '.' is the table name
    tablename = normalize_tablename(filename.split('.')[0])
    # NOTE(review): read_csv is called with its default NA handling, so literal
    # cell values such as "NA" or "NULL" are parsed as missing. Verify this is
    # intended for every column (e.g. two-letter country codes).
    df = pd.read_csv(f'{export_dir}/{filename}', delimiter='\t')
    dfs[tablename] = normalize_columns(tablename, df)
    print(filename)

In [None]:
print('Remove unnecessary columns')

# Normalize persons columns for WCA export v2: rename a legacy column only
# when the new name is not already present.
for legacy_column, current_column in (('id', 'wca_id'), ('subid', 'sub_id')):
    persons_columns = dfs['persons'].columns
    if legacy_column in persons_columns and current_column not in persons_columns:
        dfs['persons'] = dfs['persons'].rename(columns={legacy_column: current_column})

dfs['persons'] = dfs['persons'].drop(columns=['sub_id'], errors='ignore')

# Both naming styles are listed; whichever columns exist get dropped.
results_drop_columns = (
    ['personName', 'formatId', 'personCountryId']
    + ['person_name', 'format_id', 'person_country_id']
    + ['value1', 'value2', 'value3', 'value4', 'value5']
)
dfs['results'] = dfs['results'].drop(columns=results_drop_columns, errors='ignore')
dfs['results'].head()

In [None]:
print('Cast event ids to string')
# Store event ids as strings in the events table and in both rank tables
dfs['events']['id'] = dfs['events']['id'].astype(str)
for ranks_table in ('ranks_single', 'ranks_average'):
    dfs[ranks_table]['eventId'] = dfs[ranks_table]['eventId'].astype(str)

In [None]:
print('Handle duplicate persons')

# A person who moved countries can appear more than once; keep only the first
# row for each wca_id.
dfs['persons'] = dfs['persons'].drop_duplicates('wca_id')

In [None]:
print('Populate country ids')
country_ids = dfs['persons'][['wca_id', 'countryId']]

# Attach each person's countryId to both rank tables (inner join on the WCA id)
for ranks_table in ('ranks_single', 'ranks_average'):
    with_country = dfs[ranks_table].merge(country_ids, left_on='personId', right_on='wca_id')
    dfs[ranks_table] = with_country.drop('wca_id', axis=1)
dfs['ranks_single']

In [None]:
print('Populate continent ids')
continent_ids = dfs['countries'][['id', 'continentId']]

# Rank tables: look up the continent through countryId, then drop the join key
for ranks_table in ('ranks_single', 'ranks_average'):
    with_continent = dfs[ranks_table].merge(continent_ids, left_on='countryId', right_on='id')
    dfs[ranks_table] = with_continent.drop('id', axis=1)

# Persons: same lookup. If persons already has an 'id' column, the countries
# one is suffixed '_drop' and removed; otherwise nothing is dropped here.
persons_with_continent = dfs['persons'].merge(
    continent_ids,
    left_on='countryId',
    right_on='id',
    suffixes=('', '_drop'),
)
dfs['persons'] = persons_with_continent.drop('id_drop', axis=1, errors='ignore')

In [None]:
print('Populate names')
names = dfs['persons'][['wca_id', 'name']]

# Attach each person's name to both rank tables (inner join on the WCA id)
for ranks_table in ('ranks_single', 'ranks_average'):
    with_name = dfs[ranks_table].merge(names, left_on='personId', right_on='wca_id')
    dfs[ranks_table] = with_name.drop('wca_id', axis=1)

In [None]:
print('Calculate max ranks')
# Largest rank per event at world level, and per (region, event) at continent
# and country level. The rank column is selected BEFORE aggregating so that
# only that column is reduced; the other (string) columns of the rank tables
# are never touched. The resulting Series are the same as selecting afterwards.
single_world_maxes = dfs['ranks_single'].groupby('eventId')['worldRank'].max()
average_world_maxes = dfs['ranks_average'].groupby('eventId')['worldRank'].max()

single_continent_maxes = dfs['ranks_single'].groupby(['continentId', 'eventId'])['continentRank'].max()
average_continent_maxes = dfs['ranks_average'].groupby(['continentId', 'eventId'])['continentRank'].max()

single_country_maxes = dfs['ranks_single'].groupby(['countryId', 'eventId'])['countryRank'].max()
average_country_maxes = dfs['ranks_average'].groupby(['countryId', 'eventId'])['countryRank'].max()

In [None]:
# People who switched nationality may have rank of 0 for continentRank and countryRank.
# Set these values to the max possible value for that event.

def fill_zero_ranks(df, continent_maxes, country_maxes):
    """Replace zero continent/country ranks with the region's max rank.

    Args:
        df: Rank table with ``continentId``, ``countryId``, ``eventId``,
            ``continentRank`` and ``countryRank`` columns. Modified in place.
        continent_maxes: Series indexed by (continentId, eventId).
        country_maxes: Series indexed by (countryId, eventId).

    Returns:
        The same ``df`` object, after the in-place update.
    """
    region_levels = (
        ('continentId', 'continentRank', continent_maxes),
        ('countryId', 'countryRank', country_maxes),
    )
    for region_column, rank_column, region_maxes in region_levels:
        # Look up the max rank of every row's (region, event) pair
        region_event_keys = df[[region_column, 'eventId']].apply(tuple, axis=1)
        fallback_ranks = region_event_keys.map(region_maxes)

        is_zero = df[rank_column] == 0
        df.loc[is_zero, rank_column] = fallback_ranks[is_zero].values

    return df

for ranks_table, continent_maxes_for_table, country_maxes_for_table in (
    ('ranks_single', single_continent_maxes, single_country_maxes),
    ('ranks_average', average_continent_maxes, average_country_maxes),
):
    dfs[ranks_table] = fill_zero_ranks(dfs[ranks_table], continent_maxes_for_table, country_maxes_for_table)

print('There should be very few rows where continentRank or countryRank is 0')
dfs['ranks_single'].loc[dfs['ranks_single']['countryRank'] == 0]

In [None]:
import itertools

def calculate_sum_of_ranks(rank_type, events, world_maxes, continent_maxes, country_maxes):
    """Sum each person's world, continent and country ranks over ``events``.

    A person without a rank in an event is given the max rank of that event
    (world) or of that (region, event) pair (continent / country).

    Args:
        rank_type: ``'single'`` or ``'average'``; selects ``dfs['ranks_<type>']``
            and the suffix of the output columns.
        events: Event ids to include in the sum.
        world_maxes: Series indexed by eventId.
        continent_maxes: Series indexed by (continentId, eventId).
        country_maxes: Series indexed by (countryId, eventId).

    Returns:
        DataFrame with ``personId`` and the three ``<scope>Sor<Type>`` columns.
    """
    rank_columns = ['worldRank', 'continentRank', 'countryRank']
    suffix = rank_type.title()

    # One row for every (person, event) pair
    every_pair = pd.DataFrame(
        list(itertools.product(dfs['persons'].wca_id.unique(), events)),
        columns=['personId', 'eventId'],
    )
    known_ranks = dfs[f'ranks_{rank_type}'][['personId', 'eventId'] + rank_columns]
    person_regions = dfs['persons'][['wca_id', 'countryId', 'continentId']]

    # Left joins keep every pair; a missing result shows up as NaN ranks
    ranks_all = every_pair.merge(known_ranks, on=['personId', 'eventId'], how='left')
    ranks_all = ranks_all.merge(person_regions, left_on='personId', right_on='wca_id', how='left')

    ranks_all['worldRank'] = ranks_all['worldRank'].fillna(ranks_all['eventId'].map(world_maxes))
    for region_column, rank_column, region_maxes in (
        ('continentId', 'continentRank', continent_maxes),
        ('countryId', 'countryRank', country_maxes),
    ):
        region_event_index = ranks_all.set_index([region_column, 'eventId']).index
        fallback_ranks = region_event_index.map(region_maxes).to_series(index=ranks_all.index)
        ranks_all[rank_column] = ranks_all[rank_column].fillna(fallback_ranks)

    totals = ranks_all.groupby('personId')[rank_columns].sum().reset_index()
    return totals.rename(columns={
        'worldRank': f'worldSor{suffix}',
        'continentRank': f'continentSor{suffix}',
        'countryRank': f'countrySor{suffix}',
    })

In [None]:
single_events = [
    '222', '333', '333bf', '333fm', '333mbf', '333oh', '444', '444bf', '555',
    '555bf', '666', '777', 'clock', 'minx', 'pyram', 'skewb', 'sq1',
]

# Same as single_events but no 333mbf
average_events = [event_id for event_id in single_events if event_id != '333mbf']

print('Single sum of ranks')
single_sor = calculate_sum_of_ranks(
    'single', single_events, single_world_maxes, single_continent_maxes, single_country_maxes,
)
print('Average sum of ranks')
average_sor = calculate_sum_of_ranks(
    'average', average_events, average_world_maxes, average_continent_maxes, average_country_maxes,
)

In [None]:
# Add the single and then the average sum-of-ranks columns to persons
for sor_table in (single_sor, average_sor):
    persons_with_sor = dfs['persons'].merge(sor_table, left_on='wca_id', right_on='personId')
    dfs['persons'] = persons_with_sor.drop('personId', axis=1)
dfs['persons'].head()

In [None]:
# Get the min rank instead of rank 1, because sometimes rank 1 doesn't exist (idk why)
# Sorting by worldRank and keeping the first row per event yields the
# best-ranked result of every event.
best_singles = dfs['ranks_single'].sort_values('worldRank').drop_duplicates('eventId')[['eventId', 'best']].rename(columns={'best': 'single'})
best_averages = dfs['ranks_average'].sort_values('worldRank').drop_duplicates('eventId')[['eventId', 'best']].rename(columns={'best': 'average'})

# Outer join: an event present in only one of the two tables is kept, with a
# missing value (NaN) on the other side.
world_bests = best_singles.merge(best_averages, on='eventId', how='outer')
# eventId -> (best single, best average); relies on the merged column order
# being eventId, single, average.
world_bests = {eventId: (single, average) for eventId, single, average in world_bests.values}
world_bests

In [None]:
def group_dict(d):
    '''
    Turn a dict keyed by pairs into a two-level nested dict.

    The first element of each key becomes the outer key and the second
    element the inner key.

    Example input: {
        ('a', 'b'): 1,
        ('a', 'c'): 2,
    }

    Example output: {
        'a': {
            'b': 1,
            'c': 2,
        },
    }
    '''
    nested = {}
    for pair, value in d.items():
        nested.setdefault(pair[0], {})[pair[1]] = value
    return nested

# Continent level: best-ranked single and average per (event, continent)
best_singles = (
    dfs['ranks_single']
    .sort_values('continentRank')
    .drop_duplicates(['eventId', 'continentId'])
    [['eventId', 'best', 'continentId']]
    .rename(columns={'best': 'single'})
)
best_averages = (
    dfs['ranks_average']
    .sort_values('continentRank')
    .drop_duplicates(['eventId', 'continentId'])
    [['eventId', 'best', 'continentId']]
    .rename(columns={'best': 'average'})
)

continent_bests = best_singles.merge(best_averages, on=['eventId', 'continentId'], how='outer').drop_duplicates()
# Merged column order is eventId, single, continentId, average
continent_bests = group_dict({
    (continentId, eventId): (single, average)
    for eventId, single, continentId, average in continent_bests.values
})

# Country level: same computation keyed by country instead of continent
best_singles = (
    dfs['ranks_single']
    .sort_values('countryRank')
    .drop_duplicates(['eventId', 'countryId'])
    [['eventId', 'best', 'countryId']]
    .rename(columns={'best': 'single'})
)
best_averages = (
    dfs['ranks_average']
    .sort_values('countryRank')
    .drop_duplicates(['eventId', 'countryId'])
    [['eventId', 'best', 'countryId']]
    .rename(columns={'best': 'average'})
)

country_bests = best_singles.merge(best_averages, on=['eventId', 'countryId'], how='outer').drop_duplicates()
# Merged column order is eventId, single, countryId, average
country_bests = group_dict({
    (countryId, eventId): (single, average)
    for eventId, single, countryId, average in country_bests.values
})

In [None]:
def build_rank_dict(tablename):
    """Build ``{personId: {eventId: best}}`` from ``dfs[tablename]``.

    Every id in ``dfs['persons']['wca_id']`` is guaranteed a key; persons
    without any row in the rank table map to an empty dict.
    """
    bests_by_person = {}
    for personId, eventId, best in dfs[tablename][['personId', 'eventId', 'best']].values:
        bests_by_person.setdefault(personId, {})[eventId] = best

    # Make sure every person at least has an empty object
    for wca_id in dfs['persons']['wca_id']:
        bests_by_person.setdefault(wca_id, {})

    return bests_by_person

print('Build rank dicts')
# personId -> {eventId: best}, once for singles and once for averages
single_dict, average_dict = (
    build_rank_dict(f'ranks_{rank_kind}') for rank_kind in ('single', 'average')
)

In [None]:
import math

def mbldScore(value):
    """Convert an encoded 333mbf result into a kinch-style score.

    The decoding below reads ``value`` as a decimal number in which the digits
    above 10**7 hold ``99 - points`` and the next five digits hold the time in
    seconds (``99999`` meaning the time is unknown). The score is the points
    plus the fraction of one hour that was left unused.

    Args:
        value: Encoded result. ``None``, ``0``, NaN and non-positive values
            are treated as "no result".

    Returns:
        ``0`` when there is no result, otherwise ``points + proportion of the
        hour left``, never below 0. An unknown time earns no time bonus
        (previously this case raised ``TypeError``).
    """
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return 0
    if value <= 0:
        # 0 means no result; negative codes would otherwise be decoded as
        # garbage digits by the integer arithmetic below.
        return 0

    value = int(value)
    seconds = value // 100 % 100000
    points = 99 - value // 10000000 % 100

    if seconds == 99999:
        # Unknown time: be conservative and grant no time bonus
        proportionOfHourLeft = 0
    else:
        centiseconds = seconds * 100
        proportionOfHourLeft = 1 - centiseconds / 360000

    score = points + proportionOfHourLeft
    return max(score, 0)

def get_kinch_score(personId, bests, key):
    """Compute a person's kinch score against a set of best results.

    Each of the 17 events contributes a score between 0 and 100 (the best
    result divided by the person's result, times 100); the kinch score is the
    mean of those event scores.

    Args:
        personId: WCA id; must be a key of ``single_dict`` and ``average_dict``.
        bests: ``{eventId: (best single, best average)}`` when ``key`` is
            falsy, otherwise ``{key: {eventId: (best single, best average)}}``.
        key: Region (continent or country id) to select inside ``bests``, or
            ``None`` for world level.

    Returns:
        The mean event score, or ``0`` when ``key`` is given but absent from
        ``bests``.
    """
    if key:
        if key not in bests:
            # This edge case can occur if, for example, a person moves to a new country that has no results.
            # This occurred for wca id 2018YEDD01 who moved to Barbados!
            return 0
        bests = bests[key]

    personal_singles = single_dict[personId]
    personal_averages = average_dict[personId]
    scores = []

    # Handle 333mbf: only singles exist, compared through mbldScore
    bestSingle, _ = bests["333mbf"] if "333mbf" in bests else (None, None)
    mbldPersonal = mbldScore(personal_singles.get("333mbf"))
    mbldRecord = mbldScore(bestSingle)

    if mbldRecord:
        scores.append(mbldPersonal / mbldRecord * 100)
    else:
        # If nobody has mbld, the event scores 0
        scores.append(0)

    # For these events, use better between single and average
    for eventId in ["333fm", "333bf", "444bf", "555bf"]:
        single = personal_singles.get(eventId)
        average = personal_averages.get(eventId)
        bestSingle, bestAverage = bests[eventId] if eventId in bests else (None, None)

        if not single and not average:
            scores.append(0)
        elif not bestSingle or not bestAverage:
            # This can happen if a person has multiple countryIds and one of the countries has no result for the event.
            scores.append(100)
        elif not average:
            # If no average, use single
            scores.append(bestSingle / single * 100)
        else:
            # If there is an average, use the better of the two
            scores.append(max(
                bestSingle / single * 100,
                bestAverage / average * 100
            ))

    # For these events, use average
    for eventId in ['222', '333', '333oh', '444', '555', '666', '777', 'clock', 'minx', 'pyram', 'skewb', 'sq1']:
        average = personal_averages.get(eventId)
        _, bestAverage = bests[eventId] if eventId in bests else (None, None)

        if not average:
            scores.append(0)
        elif not bestAverage:
            # This can happen if a person has multiple countryIds and one of the countries has no result for the event.
            scores.append(100)
        else:
            scores.append(bestAverage / average * 100)

    return sum(scores) / len(scores)

In [None]:
persons = dfs['persons']

# (progress label, output column, bests lookup, column holding the region key)
kinch_levels = (
    ('World kinch', 'worldKinch', world_bests, None),
    ('Continent kinch', 'continentKinch', continent_bests, 'continentId'),
    ('Country kinch', 'countryKinch', country_bests, 'countryId'),
)
for level_label, kinch_column, level_bests, region_column in kinch_levels:
    print(level_label)
    persons[kinch_column] = persons.apply(
        lambda row: get_kinch_score(
            row['wca_id'],
            level_bests,
            row[region_column] if region_column else None,
        ),
        axis=1,
    )

persons.head()

In [None]:
print('Populate startDate and endDate')

def _iso_date(year, month, day):
    """Format year/month/day as yyyy-mm-dd, zero-padding month and day."""
    return f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"

def _competition_end_year(row):
    """Return the year in which the competition in ``row`` ends.

    Uses an ``end_year`` column when the table has one. Otherwise an end month
    earlier than the start month means the competition ran over New Year, so
    the end falls in the following year.
    """
    if 'end_year' in row.index:
        return row['end_year']
    if row['end_month'] < row['month']:
        return row['year'] + 1
    return row['year']

# Format is yyyy-mm-dd
dfs['competitions']['startDate'] = dfs['competitions'].apply(
    lambda row: _iso_date(row['year'], row['month'], row['day']),
    axis=1,
)
dfs['competitions']['endDate'] = dfs['competitions'].apply(
    lambda row: _iso_date(_competition_end_year(row), row['end_month'], row['end_day']),
    axis=1,
)

In [None]:
print('Calculate birthdays')

# One row per (competition, person), enriched with the competition start date
# and the person's name, ordered by start date.
comps = (
    dfs['results'][['competitionId', 'personId']]
    .drop_duplicates()
    .merge(dfs['competitions'][['id', 'startDate']], left_on='competitionId', right_on='id')
    .drop('id', axis=1)
    .merge(dfs['persons'][['wca_id', 'name']], left_on='personId', right_on='wca_id')
    .drop('wca_id', axis=1)
    .sort_values('startDate')
)

# Rows come in date order, so the first row seen for a person is their
# first competition.
first_comps = {}
for row in comps.values:
    personId = row[1]
    if personId not in first_comps:
        first_comps[personId] = row

def sort_dict(d, keys):
    """Return a new dict holding ``d``'s entries for ``keys``, in that order.

    Raises:
        KeyError: if any element of ``keys`` is not a key of ``d``.
    """
    return {key: d[key] for key in keys}

# Get persons in order of rank
# NOTE(review): this rebinds the module-level name `persons` (the persons
# DataFrame in the kinch cell) to an array of person ids.
persons = dfs['ranks_single'].sort_values('worldRank')['personId'].unique()

# Reorder first_comps to follow that rank order. sort_dict raises KeyError for
# any ranked person that has no entry in first_comps.
first_comps = sort_dict(first_comps, persons)

# Each stored row is [competitionId, personId, startDate, name]; the start
# date of the first competition is exposed as the 'date' column.
dfs['birthdays'] = pd.DataFrame(first_comps.values(), columns=['competitionId', 'personId', 'date', 'name'])
dfs['birthdays'].head()

In [None]:
import json

# Read the export metadata shipped alongside the tables
with open(f'{export_dir}/metadata.json', 'r') as f:
    data = json.load(f)

# Key/value table; only the first 10 chars of the export date (yyyy-mm-dd)
export_day = data['export_date'][0:10]
dfs['miscellaneous'] = pd.DataFrame({'key': ['export_date'], 'value': [export_day]})

dfs['miscellaneous']

## Convert to old format for backwards compatibility

Changelog can be found at: https://www.worldcubeassociation.org/export/results

In [None]:
results = dfs["results"]
attempts = dfs["result_attempts"]

# Pivot attempts back into value1..value5: one row per result_id, one
# "value<attempt_number>" column per attempt number found in the table.
attempts_wide = (
    attempts
    .pivot(index="result_id", columns="attempt_number", values="value")
    .rename(columns=lambda x: f"value{x}")
    .reset_index()
)

# Merge back into results (left join: results without attempts are kept)
results_old = results.merge(
    attempts_wide,
    left_on="id",
    right_on="result_id",
    how="left"
)

# The join keys are dropped from the old-format table
results_old = results_old.drop(columns=["id", "result_id"])

# Rename the competitions columns back to their old-format names
competitions_old = dfs["competitions"].rename(columns={
    "delegates": "wcaDelegate",
    "organizers": "organiser",
    "latitude_microdegrees": "latitude",
    "longitude_microdegrees": "longitude",
    "event_specs": "eventSpecs",
    "end_month": "endMonth",
    "end_day": "endDay",
})

# In the old format the WCA id lives in the "id" column.
# NOTE(review): drop(columns=['id']) raises KeyError unless persons still has
# an 'id' column at this point - presumably the one left over from the
# continent merge with countries; confirm.
persons_old = dfs["persons"].drop(columns=['id']).rename(columns={
    "wca_id": "id",
    "sub_id": "subid",
})

# Replace dfs with the old-format table names; these become the SQLite table
# names.
# NOTE(review): dfs["staff"] raises KeyError when no staff.tsv was loaded,
# although the staff file is copied only "if it exists locally" - confirm
# whether it is required.
dfs = {
    "Results": results_old,
    "Events": dfs["events"],
    "Persons": persons_old,
    "Competitions": competitions_old,
    "Countries": dfs["countries"],
    "Continents": dfs["continents"],
    "RanksSingle": dfs["ranks_single"],
    "RanksAverage": dfs["ranks_average"],
    "Staff": dfs["staff"],
    "Birthdays": dfs["birthdays"],
    "Miscellaneous": dfs["miscellaneous"],
}

In [None]:
import sqlite3

def df_to_sqlite(df, table_name, db_path='wca.db'):
    """Write ``df`` to the SQLite database as ``table_name``.

    An existing table of that name is replaced, and the DataFrame index is not
    written. Failures are reported on stdout rather than raised, so one bad
    table does not stop the remaining tables from being written.

    Args:
        df: DataFrame to store.
        table_name: Name of the table to create.
        db_path: Path of the SQLite file; defaults to ``'wca.db'``.
    """
    conn = sqlite3.connect(db_path)

    try:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        print(f"{table_name} table created")
    except Exception as e:
        # Name the table so the failing write can be identified in the output
        print(f"Error writing {table_name}: {e}")
    finally:
        conn.close()

# Start from a fresh database file: remove wca.db if a previous run left one
if os.path.exists('wca.db'):
    os.remove('wca.db')

# One SQLite table per entry of dfs, named after its key
for name, frame in dfs.items():
    df_to_sqlite(frame, name)

In [None]:
print('Create indices')

# (index name, table, column)
# TODO: Analyze which of these indices are actually needed
index_specs = [
    ('idx_persons_wca_id', 'Persons', 'id'),
    ('idx_persons_countryId', 'Persons', 'countryId'),
    ('idx_persons_continentId', 'Persons', 'continentId'),
    ('idx_persons_countryKinch', 'Persons', 'countryKinch'),
    ('idx_persons_continentKinch', 'Persons', 'continentKinch'),
    ('idx_persons_worldKinch', 'Persons', 'worldKinch'),
    ('idx_persons_countrySorSingle', 'Persons', 'countrySorSingle'),
    ('idx_persons_continentSorSingle', 'Persons', 'continentSorSingle'),
    ('idx_persons_worldSorSingle', 'Persons', 'worldSorSingle'),
    ('idx_persons_countrySorAverage', 'Persons', 'countrySorAverage'),
    ('idx_persons_continentSorAverage', 'Persons', 'continentSorAverage'),
    ('idx_persons_worldSorAverage', 'Persons', 'worldSorAverage'),
    ('idx_ranks_single_eventId', 'RanksSingle', 'eventId'),
    ('idx_ranks_single_personId', 'RanksSingle', 'personId'),
    ('idx_ranks_single_worldRank', 'RanksSingle', 'worldRank'),
    ('idx_ranks_single_continentRank', 'RanksSingle', 'continentRank'),
    ('idx_ranks_single_countryRank', 'RanksSingle', 'countryRank'),
    ('idx_ranks_average_eventId', 'RanksAverage', 'eventId'),
    ('idx_ranks_average_personId', 'RanksAverage', 'personId'),
    ('idx_ranks_average_worldRank', 'RanksAverage', 'worldRank'),
    ('idx_ranks_average_continentRank', 'RanksAverage', 'continentRank'),
    ('idx_ranks_average_countryRank', 'RanksAverage', 'countryRank'),
    ('idx_staff_wca_id', 'Staff', 'wca_id'),
]

conn = sqlite3.connect('wca.db')
try:
    c = conn.cursor()
    for index_name, table, column in index_specs:
        # IF NOT EXISTS keeps this cell re-runnable against an existing database
        c.execute(f'CREATE INDEX IF NOT EXISTS {index_name} ON {table}({column});')
    conn.commit()
finally:
    # Close the connection even when one of the statements fails
    conn.close()