In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras

In [7]:
def stand(x):
    """
    Standardise x to zero mean and unit standard deviation, ignoring NaNs.

    Args:
      x (array-like): values to standardise; NaN entries are ignored when
        computing the statistics and stay NaN in the result.
    Returns:
      Same type/shape as x: (x - nanmean(x)) / nanstd(x). When the standard
      deviation is zero (single observation or all observations equal) or is
      not finite (no observations at all), the values are only centred, so
      observed entries become 0 instead of NaN.
    """
    mean = np.nanmean(x)
    std = np.nanstd(x)
    # Dividing by a zero std would turn every *observed* entry into NaN (0/0),
    # making it indistinguishable from a missing entry downstream.
    if not np.isfinite(std) or std == 0:
        return x - mean
    return (x - mean) / std

def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Regularised squared-error cost for collaborative filtering.

    Fully vectorised and built from TensorFlow ops so it can be differentiated
    inside a custom training loop.

    Args:
      X (ndarray (num_movies, num_features)): item feature matrix
      W (ndarray (num_users, num_features)) : per-user weight matrix
      b (ndarray (1, num_users))            : per-user bias vector
      Y (ndarray (num_movies, num_users))   : ratings matrix
      R (ndarray (num_movies, num_users))   : mask, R(i, j) = 1 if item i was rated by user j
      lambda_ (float): regularisation strength
    Returns:
      J (float) : cost; half the masked sum of squared errors plus
                  lambda_/2 times the squared entries of X and W
                  (b is not regularised).
    """
    # Predicted rating for every (item, user) pair.
    predictions = tf.linalg.matmul(X, tf.transpose(W)) + b

    # Squared residuals; multiplying by R zeroes out the unrated pairs.
    masked_sq_err = ((predictions - Y) ** 2) * R
    data_term = 0.5 * tf.reduce_sum(masked_sq_err)

    # L2 penalty on the item features and the user weights.
    penalty = (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))

    return data_term + penalty

In [8]:
# Race results: one row per race (indexed by the "Race" column), one column per rider.
RESULTS_URL = "https://raw.githubusercontent.com/schubertjan/race-recommender/master/race_results.csv"

results_df = (
    pd.read_csv(RESULTS_URL)
    .set_index("Race")
    .rename(columns=str.lower)  # lower-case the rider names used as column labels
)
results_df

Unnamed: 0_level_0,aagaard hansen tobias,aalerud katrine,aalrust håkon,aas eirik vang,aasheim ludvig,aaskov pallesen jeppe,aasvold kristian,abay burak,abbas yaser,abbasov ibad,...,štoček matúš,štybar zdeněk,ťoupalík adam,ťoupalík jakub,żelazowski michał,żuber adam,żurek jakub,žigart urška,žumer matic,țvetcov serghei
Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-districtenpijl-ekeren-deurne-me,,,,,,,,,,,...,,,,,,,,,,
Eschborn-Frankfurt,,,,,,,,,,,...,,,,,,,,,,
albani-classic-fyen-rundt,,,,,,,,,,,...,0.0,,,,,,,,,
amstel-gold-2021,,,,,,,,,,,...,,,,,,,,,,
amstel-gold-2022,,,,,,,,,,,...,,20.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vuelta-ciclista-a-la-region-de-murcia,,,,,,,,,,,...,,,,,,,,,,
world-championship,,,,,,,,,,,...,,,0.0,,,,,,,
world-championship-we,,20.0,,,,,,,,,...,,,,,,,,0.0,,
youngster-coast-challenge,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Raw results matrix (rows: races, columns: riders); NaN marks a missing result.
Y = results_df.to_numpy()

# Training target: log(y + 1), standardised within each race (row).
Y_norm = results_df.apply(lambda row: stand(np.log(row + 1)), axis=1).to_numpy()

# Per-race mean/std of the log-transformed results, kept so that predictions
# can be mapped back to the original scale. The log transform is computed once.
Y_log = np.log(Y + 1)
Y_mean = np.nanmean(Y_log, axis=1)
Y_std = np.nanstd(Y_log, axis=1)

# Indicator matrix: 1 where a usable training target exists, 0 otherwise.
# An entry is unusable when the raw result is missing OR when its normalised
# value is not finite (for example after a division by a zero standard
# deviation); otherwise such entries would receive the sentinel below while
# still counting towards the cost.
unusable = np.isnan(Y) | ~np.isfinite(Y_norm)
R = np.ones(Y.shape)
R[unusable] = 0

# Fill unusable targets with a finite sentinel so no NaN reaches the cost
# function; these positions are multiplied by R == 0 there.
Y_norm[unusable] = -99

In [15]:
#  Useful Values
num_races, num_riders = Y.shape
num_features = 100   # number of latent features learned per race / per rider
lambda_ = 1          # regularisation strength passed to cofi_cost_func
iterations = 1000    # number of optimiser steps

# Set Initial Parameters (W, X, b), use tf.Variable to track these variables.
# The creation order W -> X -> b is kept so the seeded random draws are unchanged.
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_riders,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_races, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_riders),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

# `step` is used instead of `iter` so the builtin iter() is not shadowed
# for every later cell in the kernel.
for step in range(iterations):
    # Use TensorFlow's GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Y_norm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one optimiser step by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Log periodically.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 3207982.9
Training loss at iteration 20: 175152.0
Training loss at iteration 40: 72002.5
Training loss at iteration 60: 39691.9
Training loss at iteration 80: 27006.5
Training loss at iteration 100: 20741.8
Training loss at iteration 120: 17068.6
Training loss at iteration 140: 14655.4
Training loss at iteration 160: 12941.4
Training loss at iteration 180: 11654.8
Training loss at iteration 200: 10650.9
Training loss at iteration 220: 9846.6
Training loss at iteration 240: 9190.0
Training loss at iteration 260: 8647.3
Training loss at iteration 280: 8194.4
Training loss at iteration 300: 7813.8
Training loss at iteration 320: 7492.1
Training loss at iteration 340: 7218.8
Training loss at iteration 360: 6985.7
Training loss at iteration 380: 6786.0
Training loss at iteration 400: 6614.3
Training loss at iteration 420: 6466.0
Training loss at iteration 440: 6337.5
Training loss at iteration 460: 6225.8
Training loss at iteration 480: 6128.2
Training loss at 

In [16]:
# Predicted (standardised, log-scale) result for every race/rider pair
# from the trained features, weights and biases.
p = X.numpy() @ W.numpy().T + b.numpy()

# Undo the per-race standardisation: scale by each race's std and add back its mean.
race_std = Y_std.reshape(p.shape[0], 1)
race_mean = Y_mean.reshape(p.shape[0], 1)
pm_e = p * race_std + race_mean

# Invert the log(y + 1) transform to get back to the original scale.
pm = np.exp(pm_e) - 1

In [17]:
# Compare predictions against the observed results for a single rider.
rider_name = "kristoff alexander"

# Boolean mask over the columns, then the positional index/indices it selects.
j = results_df.columns.isin([rider_name])
rider = np.arange(0, results_df.shape[1])[j]

# Exactly one column must match. Zero or several matches would otherwise
# surface as an opaque length-mismatch ValueError from the DataFrame constructor.
if rider.size != 1:
    raise ValueError(
        f"expected exactly one column named {rider_name!r}, found {rider.size}"
    )

check = pd.DataFrame(
    data={
        "y_hat": pm[:, rider].reshape(-1,),
        "y": results_df.iloc[:, rider].values.reshape(-1,),
    },
    index=results_df.index,
)
# Keep only the races where this rider has an observed result.
check.dropna()

Unnamed: 0_level_0,y_hat,y
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
Eschborn-Frankfurt,183.710666,215.0
clasica-de-almeria,119.320179,150.0
deia-trophy,45.012842,50.0
dwars-door-vlaanderen,52.906716,60.0
e3-harelbeke,0.417374,0.0
gent-wevelgem,38.364694,40.0
gp-du-canton-d-argovie,0.115778,0.0
gp-jef-scherens,59.74868,70.0
kuurne-brussel-kuurne,102.029117,125.0
milano-sanremo,6.813578,6.666667


In [18]:
from google.colab import files

# Label the prediction matrix with the same races (rows) and riders (columns)
# as the input frame, write it to disk, then offer it as a browser download.
y_hat_df = pd.DataFrame(pm, index=results_df.index, columns=results_df.columns)

# NOTE(review): index=False leaves the race labels out of the file even though
# they are set as the index above - confirm this is what the consumer expects.
y_hat_df.to_csv("y_hat.csv", index=False)

files.download('y_hat.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>