In [3]:
import datetime
import json
import re
import time
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

# Base URL for the per-rider power analysis data. The request URL is built as
# base_url + <zwift id> + base_url_appendix. The cache3 URL is used to speed
# up scraping and to avoid hitting the server too hard. To get the freshest
# data instead (very slow), use the following two commented-out lines in
# place of the cache3 assignments below.
# base_url = "https://www.zwiftpower.com/api3.php?do=analysis_list&zwift_id="
# base_url_appendix = ""

base_url = "https://www.zwiftpower.com/cache3/profile/"
base_url_appendix = "_analysis_list.json"

##############################################################################

# The top riders JSON is located here ↓
# https://www.zwiftpower.com/cache3/lists/2_standings_.json
# Uncomment the lines below (and disable the local-file block further down)
# to use this list of the top ~1000 riders
#with urllib.request.urlopen("https://www.zwiftpower.com/"\
#                            "cache3/lists/2_standings_.json") as response:
#    raw_json_top = response.read()
    
# The JSON with all riders is located here ↓
# https://www.zwiftpower.com/cache3/global/rider_list.json
# A version of the list from 2021-03-17 with most of the JSON attributes
# stripped (only including name, zwift id, ftp and rank) is available in
# the github repo and is loaded by the next lines.
# Only riders whose ZwiftPower rank is below this value are scraped.
MAX_RANK = 500

# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform default encoding (which is not UTF-8 everywhere).
with open("minified_rider_list.json", encoding="utf-8") as file:
    raw_json_top = file.read()

##############################################################################

json_top_rankings_zp = json.loads(raw_json_top)

# We are interested in the top Zwifters' personal data (name and ID).
top_zwifter = json_top_rankings_zp.get("data")
top_zwids, riders = [], []

# Build two parallel lists: Zwift IDs and the matching rider names.
for zwifter in top_zwifter:
    rank = zwifter.get("rank")

    # A rider without a rank cannot be compared against MAX_RANK; skip the
    # entry instead of crashing on it.
    if not rank:
        continue

    # The rank is a string; its last three characters are dropped before the
    # integer comparison (presumably a ".dd" decimal part -- TODO confirm
    # against the rider list JSON).
    if int(rank[:-3]) < MAX_RANK:
        top_zwids.append(zwifter.get("zwid"))
        riders.append(zwifter.get("name"))

# Result columns, filled by the scraping loop (one entry per dual recording).
zwift_ids, setids, names, dates, titles = [], [], [], [], []
device_pm, device_st = [], []
power300_pm, power300_st = [], []
group = []

# Index of the next rider to scrape and timing bookkeeping for the
# progress output of the scraping loop.
progress = 0
time_start, time_request = time.time(), 0


print(f"Selected {len(top_zwids)} Zwifters with rank below {MAX_RANK} from"
      f" {len(top_zwifter)} Zwifters total.")

Selected 39264 Zwifters with rank below 500 from 300205 Zwifters total.


In [None]:

# Maximum relative difference allowed between the two 300 s power readings
# of a dual recording (0.1 = 10 %). Larger deviations are discarded.
MAX_POWER_DEVIATION = 0.1

# Wahoo Kickr, but neither the Kickr Bike nor the Kickr Snap.
# NOTE: the former patterns "kickr*" / ".snap.*" also matched a plain "kick"
# and missed names starting with "snap", contradicting the stated intent.
KICKR_PATTERN = re.compile("kickr", re.I)
KICKR_EXCLUDED_PATTERN = re.compile("bike|snap", re.I)

# Tacx Neo, but not if a 2 comes after "neo" -- except when the 2 is followed
# by a 0 (as in 20), so that a "Tacx Neo 2017" is included in the match, but
# a "Tacx Neo 2T" is not. The Neo Bike is excluded as well.
NEO_PATTERN = re.compile(".*neo(?![2t])", re.I)
NEO_TWO_PATTERN = re.compile("2(?!0)", re.I)
NEO_BIKE_PATTERN = re.compile(".*bike.*", re.I)


def _is_kickr(device_name):
    """Return True if the device name denotes a Wahoo Kickr (no Bike/Snap)."""
    return bool(KICKR_PATTERN.search(device_name)
                and not KICKR_EXCLUDED_PATTERN.search(device_name))


def _is_neo(device_name):
    """Return True if the device name denotes a first-generation Tacx Neo."""
    return bool(NEO_PATTERN.search(device_name)
                and not NEO_TWO_PATTERN.search(device_name)
                and not NEO_BIKE_PATTERN.search(device_name))


def _store_dual_recording(zwid, rider_name, race, devices, trainer_index,
                          group_name):
    """Append one dual recording to the global result lists.

    trainer_index is the position (0 or 1) of the smart trainer within
    `devices` and the race's "power300" list; the other position is taken
    to be the power meter.
    """
    powers = race.get("power300")
    meter_index = 1 - trainer_index
    setids.append(race.get("set_id"))
    zwift_ids.append(zwid)
    names.append(rider_name)
    dates.append(race.get("date"))
    titles.append(race.get("title"))
    group.append(group_name)
    device_pm.append(devices[meter_index])
    device_st.append(devices[trainer_index])
    power300_pm.append(powers[meter_index])
    power300_st.append(powers[trainer_index])


# iterate over all zwifters, starting from index = progress (default=0)
for zwifter, rider in zip(top_zwids[progress:], riders[progress:]):

    # estimate the remaining time from the average time per rider so far
    elapsed = time.time() - time_start
    fraction_done = (progress + 1) / len(top_zwids)
    time_rem = datetime.timedelta(seconds=elapsed / fraction_done - elapsed)

    # print progress
    print(f"# {progress:4}. ID: {zwifter:8}."
          f" {progress/len(top_zwids) * 100:5.2f} % finished."
          f" Request time: {time_request:.2f} s. Est."
          f" {time_rem} remaining.")

    progress += 1

    # wait for some seconds – don't flood the server
    #time.sleep(0.5)

    time_request_start = time.time()

    # request analysis JSON for rider
    request = urllib.request.Request(base_url + str(zwifter) + base_url_appendix)

    # Try to fetch and decode the analysis. Skip to the next rider on any
    # error; the context manager makes sure the connection is closed.
    try:
        with urllib.request.urlopen(request) as response:
            analysis_json_obj = json.loads(response.read())
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read())
        continue
    except urllib.error.URLError as e:
        print(f"Skipping ID {zwifter}: {e.reason}")
        continue
    except ValueError as e:
        # json.JSONDecodeError is a ValueError: the body was not valid JSON
        print(f"Skipping ID {zwifter}: invalid JSON ({e})")
        continue

    time_request = time.time() - time_request_start

    # get every race for current rider; a missing "data" entry means there
    # is nothing to analyse for this rider
    races = (analysis_json_obj.get("data")
             if isinstance(analysis_json_obj, dict) else None)
    if not races:
        continue

    # iterate over each race
    for race in races:
        powers = race.get("power300")

        # only races with exactly 2 values for 300s power, none of them zero
        if not powers or 0 in powers or len(powers) != 2:
            continue

        # device names are stored as name1, name2
        devices = [race.get("name1"), race.get("name2")]

        # skip races where a device name is missing (0 / None / non-text);
        # the regex classification below needs real strings
        if not all(isinstance(device, str) for device in devices):
            continue

        kickr_names = [device for device in devices if _is_kickr(device)]
        neo_names = [device for device in devices if _is_neo(device)]
        if not (kickr_names or neo_names):
            continue

        # discard recordings whose two power readings differ by 10 % or more
        if not abs(powers[0] / powers[1] - 1) < MAX_POWER_DEVIATION:
            continue

        # A race is stored once per matching trainer group, so a Kickr-vs-Neo
        # recording ends up in both groups.
        if kickr_names:
            _store_dual_recording(zwifter, rider, race, devices,
                                  devices.index(kickr_names[0]), "wahoo")
        if neo_names:
            _store_dual_recording(zwifter, rider, race, devices,
                                  devices.index(neo_names[0]), "neo")

print(f"Finished! Saved {len(zwift_ids)} rows in {time.time()-time_start} s.")

In [5]:
# Combine the scraped lists into one DataFrame indexed by set id, drop rows
# that are exact duplicates of an earlier row, and add the delta (difference)
# between the power meter and smart trainer readings.
df = (
    pd.DataFrame(
        {
            "zwift_id": zwift_ids,
            "name": names,
            "time": pd.to_datetime(dates, unit="s"),
            "group": group,
            "title": titles,
            "pm": device_pm,
            "st": device_st,
            "p300s_pm": power300_pm,
            "p300s_st": power300_st,
        },
        index=setids,
    )
    .drop_duplicates(keep="first")
    .assign(delta=lambda frame: frame["p300s_pm"] - frame["p300s_st"])
)


In [35]:
# Make a new column with the most common power meter brands for later
# analysis. pm_regexs[i] is replaced by pm_brands[i]; several spellings can
# map to the same brand label (favero/assioma, power2max/p2m). Device names
# that match none of the patterns keep their original text.
pm_brands = ["_quarq", "_favero", "_favero", "_srm", "_power2max",
             "_power2max", "_4iiii", "_stages", "_vector", "_rotor",
             "_powertap"]
pm_regexs = ["(?i).*quarq.*", "(?i).*favero.*", "(?i).*assioma.*",
             "(?i).*srm.*", "(?i).*power2max.*", "(?i).*p2m.*",
             "(?i).*4iiii.*", "(?i).*stages.*", "(?i).*vector.*",
             "(?i).*rotor.*", "(?i).*powertap.*"]
df["pm_brand"] = df["pm"].replace(pm_regexs, pm_brands, regex=True)

# Mean values per recognised brand. numeric_only=True is required because the
# frame also holds text columns, for which a mean raises a TypeError on
# pandas >= 2.0.
df.loc[df["pm_brand"].isin(pm_brands)].groupby("pm_brand").mean(numeric_only=True)

Unnamed: 0_level_0,zwift_id,p300s_pm,p300s_st,delta
pm_brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
_4iiii,762683.999008,339.289683,338.875,0.414683
_favero,642969.914568,326.360698,324.983005,1.377693
_power2max,396926.465148,340.707384,339.694962,1.012422
_powertap,497423.264925,338.682836,340.483209,-1.800373
_quarq,461275.907264,359.088099,354.360124,4.727975
_rotor,353671.611189,371.545455,369.742657,1.802797
_srm,602086.204959,351.996694,350.795041,1.201653
_stages,662831.64905,336.225434,337.52436,-1.298927
_vector,477169.884153,335.530079,335.556892,-0.026813


In [39]:
# number of distinct riders with at least one Tacx Neo recording
df.loc[df["group"] == "neo", "zwift_id"].nunique(dropna=False)

336

In [6]:
# Print the key stats (mean + std.dev) for the Kickr and the Neo group.
# Uncomment the two options below to show all columns/rows:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
for heading, group_name in (("Wahoo Kickr statistics:", "wahoo"),
                            ("\n\n Tacx Neo statistics:", "neo")):
    print(heading)
    print(df.loc[df["group"] == group_name].describe())

Wahoo Kickr statistics:
           zwift_id      p300s_pm      p300s_st         delta
count  1.986600e+04  19866.000000  19866.000000  19866.000000
mean   6.005959e+05    340.369022    339.341790      1.027232
std    6.259259e+05     65.239106     65.069075     10.113636
min    4.990000e+02     59.000000     57.000000    -41.000000
25%    1.275610e+05    301.000000    300.000000     -5.000000
50%    3.979060e+05    347.000000    346.000000      1.000000
75%    8.929640e+05    385.000000    384.000000      7.000000
max    3.649768e+06    529.000000    540.000000     48.000000


 Tacx Neo statistics:
           zwift_id     p300s_pm     p300s_st        delta
count  3.234000e+03  3234.000000  3234.000000  3234.000000
mean   3.440202e+05   328.300866   324.958874     3.341991
std    4.105282e+05    56.804989    56.634577     9.279716
min    3.200000e+02    94.000000    87.000000   -35.000000
25%    7.964800e+04   296.000000   292.000000    -2.000000
50%    1.815040e+05   332.000000   329.0

In [36]:
# Write the data set to CSV; the rider names are left out of the export.
df_without_names = df.drop(columns="name")
df_without_names.to_csv("data2.csv")