In [3]:
import requests 
import time
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from bs4 import BeautifulSoup

In [4]:
def extract_results(year: str, circuit: int) -> pd.DataFrame:

  header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  }

  base_url = "https://www.procyclingstats.com/"
  r = requests.get(f"{base_url}races.php?year={year}&circuit={circuit}&class=&filter=Filter", headers=header)

  soup = BeautifulSoup(r.content)

  href = []

  for line in soup.find_all('a'):
      href.append(line.get('href'))
    
  race_url = [line for line in href if line.find("race/") == 0]
  race_url = [line for line in race_url if line.find(year) == (len(line)-4)]

  results = []

  for race in race_url:
    # dont use nc
    if ("nc-" in race) | ("national-championships" in race):
      print(f"NC race: {race}")
    else:  
      try:
        r = requests.get(base_url + race, headers=header)
        time.sleep(3)
        result = pd.read_html(r.text)[0]
        if "GC" not in result.columns:
          # do some data cleaning
          for index, row in result.iterrows():
            # clean rider names
            result.loc[index, "Rider"] = row["Rider"].replace(row["Team"], "")
            # make sure we have 0 when participated and NaN when not participated
            if (~row[["Rnk"]].isin(["DNF", "DNS"])[0]) & row[["UCI"]].isna()[0]:
              result.loc[index, "UCI"] = 0
            
          # extract info
          aux = result[["Rider", "UCI"]]
          # add meta data
          aux["Race"] = race.replace("race/", "").replace(f"/{year}", "")
          aux["Year"] = year
          results.append(aux)
          print(f"Extracting: {race}")
        else:
          print(f"Stage race: {race}")
      except:
        print(f"Extraction failed: {race}")
    
  return pd.concat(results)

def stand(x):
  return (x-np.nanmean(x)) / np.nanstd(x)

In [5]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    f_x = tf.linalg.matmul(X, tf.transpose(W)) + b
    E = (f_x - Y)**2
    j = E*R
    J = 0.5 * tf.reduce_sum(j) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [3]:
years = ["2020", "2021", "2022"]
circuits = [1, 13]
results = []
for year in years:
  for circuit in circuits:
    try: 
      results.append(extract_results(year=year, circuit=circuit))
    except:
      pass

Stage race: race/tour-down-under/2020


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Extracting: race/great-ocean-race/2020
Stage race: race/uae-tour/2020
Extracting: race/omloop-het-nieuwsblad/2020
Stage race: race/paris-nice/2020
Extracting: race/strade-bianche/2020
Stage race: race/tour-de-pologne/2020
Extracting: race/milano-sanremo/2020
Stage race: race/dauphine/2020
Extracting: race/il-lombardia/2020
Extracting: race/bretagne-classic/2020
Stage race: race/tour-de-france/2020
Stage race: race/tirreno-adriatico/2020
Stage race: race/benelux-tour/2020
Extracting: race/la-fleche-wallone/2020
Stage race: race/giro-d-italia/2020
Extracting: race/liege-bastogne-liege/2020
Extracting: race/gent-wevelgem/2020
Extracting: race/ronde-van-vlaanderen/2020
Stage race: race/vuelta-a-espana/2020
Extracting: race/oxyclean-classic-brugge-de-panne/2020
Extracting: race/gp-belek/2020
Extracting: race/trofeo-cala-millor/2020
Extracting: race/deia-trophy/2020
Extracting: race/trofeo-andratx-mirador-d-es-colomer/2020
Extracting: race/gp-d-ouverture/2020
Extracting: race/trofeo-palma/20

In [5]:
results_df = pd.concat(results)
results_df = results_df.pivot_table(index="Race", columns="Rider", values="UCI", aggfunc="mean")

In [6]:
results_df.shape

(196, 5087)

In [8]:
from google.colab import files

results_df.to_csv('race_results.csv', index=True)

files.download('race_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
Y = results_df.to_numpy()
Y_norm = results_df.apply(lambda row: stand(row), axis=1).to_numpy()
Y_norm[results_df.isna().to_numpy()] = -99

R = np.zeros(Y.shape)
R[results_df.notna().to_numpy()] = 1

NameError: ignored

NameError: ignored

In [52]:
Y_norm

nan

In [49]:
#  Useful Values
num_races, num_riders = Y.shape
num_features = 100
lambda_ = 0.001
# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_riders,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_races, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_riders),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Y_norm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: nan
Training loss at iteration 20: nan
Training loss at iteration 40: nan


KeyboardInterrupt: ignored

In [None]:
# year = "2022"
# circuit=13
# header = {
#     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
#     "X-Requested-With": "XMLHttpRequest"
#   }
# base_url = "https://www.procyclingstats.com/"
# r = requests.get(f"{base_url}races.php?year={year}&circuit={circuit}&class=&filter=Filter", headers=header)

# soup = BeautifulSoup(r.content)

# href = []

# for line in soup.find_all('a'):
#   href.append(line.get('href'))


# race_url = [line for line in href if line.find("race/") == 0]
# race_url = [line for line in race_url if line.find(year) == (len(line)-4)]
# race_url

# results = []

# for race in race_url:
#   r = requests.get(base_url + race, headers=header)
#   time.sleep(2)
#   print(race)
#   result = pd.read_html(r.text)[0]
#   if "GC" not in result.columns:
#     aux = result[["Rider", "Pnt"]]
#     aux["Race"] = race
#     aux["Year"] = year
#     results.append(aux)
  
# results = pd.concat(results)