In [278]:
import requests 
import time

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

In [279]:
def clean_df(df, race, year):
  for index, row in df.iterrows():
    # clean rider names
    df.loc[index, "Rider"] = row["Rider"].replace(row["Team"], "")
    # make sure we have 0 when participated and NaN when not participated
    if (~row[["Rnk"]].isin(["DNF", "DNS"])[0]) & row[["UCI"]].isna()[0]:
      df.loc[index, "UCI"] = 0
    
    # extract info
  df2 = df[["Rider", "UCI"]]
          # add meta data
  df2["Race"] = race.replace("race/", "").replace(f"/{year}", "").replace(f"-{year}", "")
  df2["Year"] = year
  return df2

def extract_stage_urls(r):
  soup = BeautifulSoup(r.content)
  hrefs = [line.get("href") for line in soup.select("a")]
  hrefs = [href for href in hrefs if href is not None]
  hrefs = [href for href in hrefs if href.find("stage-") > 0]
  last_stage = [href for href in hrefs if len(re.findall(r'\d+$', href)) > 0][0]

  last_stage_number = int(last_stage.split("-")[-1])
  stage_numbers = np.arange(1,last_stage_number + 1)

  stage_urls = []
  for stage_num in stage_numbers:
    stage_urls.append("-".join(last_stage.split("-")[:-1] + [str(stage_num)]))
  return stage_urls

def extract_results(year: str, circuit: int) -> pd.DataFrame:

  header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  }

  base_url = "https://www.procyclingstats.com/"
  r = requests.get(f"{base_url}races.php?year={year}&circuit={circuit}&class=&filter=Filter", headers=header)

  soup = BeautifulSoup(r.content)

  href = []

  for line in soup.find_all('a'):
      href.append(line.get('href'))
    
  race_url = [line for line in href if line.find("race/") == 0]
  race_url = [line for line in race_url if line.find(year) == (len(line)-4)]

  results = []

  for race in race_url:
    # dont use nc
    if ("nc-" in race) | ("national-championships" in race):
      print(f"NC race: {race}")
    else:  
      try:
        r = requests.get(base_url + race, headers=header)
        time.sleep(3)
        result = pd.read_html(r.text)[0]
        if "GC" not in result.columns:
          # do some data cleaning
          result = clean_df(result, race, year)
          results.append(result)
          print(f"Extracting 1day race:: {race}")
        else:
          stage_urls = extract_stage_urls(r)
          stages = []
          # extract results for all stages and sum
          for stage_url in stage_urls:
            print(f"Extracting stage race: {stage_url}")
            r_stage = requests.get(base_url + stage_url, headers=header)
            time.sleep(3)
            stage_df = pd.read_html(r_stage.text)[0]
            stages.append(clean_df(stage_df, race, year))
          stages_df = pd.concat(stages)
          stage_summary = stages_df.groupby(["Rider", "Race", "Year"], as_index=False)["UCI"].sum()[["Rider", "UCI", "Race", "Year"]]
          results.append(stage_summary)
      except:
        print(f"Extraction failed: {race}")
    
  return pd.concat(results)

In [280]:
years = ["2020", "2021", "2022"]
circuits = [1, 13]
results = []
for year in years:
  for circuit in circuits:
    try: 
      results.append(extract_results(year=year, circuit=circuit))
    except:
      pass

Extracting stage race: race/tour-down-under/2020/stage-1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Extracting stage race: race/tour-down-under/2020/stage-2
Extracting stage race: race/tour-down-under/2020/stage-3
Extracting stage race: race/tour-down-under/2020/stage-4
Extracting stage race: race/tour-down-under/2020/stage-5
Extracting stage race: race/tour-down-under/2020/stage-6
Extracting 1day race:: race/great-ocean-race/2020
Extracting stage race: race/uae-tour/2020/stage-1
Extracting stage race: race/uae-tour/2020/stage-2
Extracting stage race: race/uae-tour/2020/stage-3
Extracting stage race: race/uae-tour/2020/stage-4
Extracting stage race: race/uae-tour/2020/stage-5
Extracting 1day race:: race/omloop-het-nieuwsblad/2020
Extracting stage race: race/paris-nice/2020/stage-1
Extracting stage race: race/paris-nice/2020/stage-2
Extracting stage race: race/paris-nice/2020/stage-3
Extracting stage race: race/paris-nice/2020/stage-4
Extracting stage race: race/paris-nice/2020/stage-5
Extracting stage race: race/paris-nice/2020/stage-6
Extracting stage race: race/paris-nice/2020/stag

In [282]:
results_df = pd.concat(results)
results_df = results_df.pivot_table(index="Race", columns="Rider", values="UCI", aggfunc="mean")
results_df.columns = [col.lower() for col in results_df.columns]

In [283]:
results_df.shape

(272, 5956)

In [284]:
from google.colab import files

results_df.to_csv('race_results.csv', index=True)

files.download('race_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# year = "2022"
# circuit=13
# header = {
#     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
#     "X-Requested-With": "XMLHttpRequest"
#   }
# base_url = "https://www.procyclingstats.com/"
# r = requests.get(f"{base_url}races.php?year={year}&circuit={circuit}&class=&filter=Filter", headers=header)

# soup = BeautifulSoup(r.content)

# href = []

# for line in soup.find_all('a'):
#   href.append(line.get('href'))


# race_url = [line for line in href if line.find("race/") == 0]
# race_url = [line for line in race_url if line.find(year) == (len(line)-4)]
# race_url

# results = []

# for race in race_url:
#   r = requests.get(base_url + race, headers=header)
#   time.sleep(2)
#   print(race)
#   result = pd.read_html(r.text)[0]
#   if "GC" not in result.columns:
#     aux = result[["Rider", "Pnt"]]
#     aux["Race"] = race
#     aux["Year"] = year
#     results.append(aux)
  
# results = pd.concat(results)