In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras

In [2]:
def stand(x):
    """
    Standardize x to zero mean and unit standard deviation, ignoring NaNs.

    Args:
      x (array-like): values to standardize; NaN entries are skipped when the
        mean and standard deviation are computed and stay NaN in the result.
    Returns:
      Same shape as x: (x - nanmean(x)) / nanstd(x). When the non-NaN values
      are all identical (standard deviation of 0) the centered values are
      returned as-is, i.e. zeros, instead of the NaNs a 0/0 division yields.
    """
    centered = x - np.nanmean(x)
    spread = np.nanstd(x)
    # Zero spread means every observed value equals the mean; dividing would
    # produce 0/0 = NaN for entries that are perfectly valid observations.
    if spread == 0:
        return centered
    return centered / spread

def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost for collaborative filtering.

    Built entirely from TensorFlow operations so it can be differentiated
    inside a tf.GradientTape in a custom training loop.

    Args:
      X (ndarray (num_movies, num_features)): matrix of item features
      W (ndarray (num_users, num_features)) : matrix of user parameters
      b (ndarray (1, num_users))            : vector of user biases
      Y (ndarray (num_movies, num_users))   : matrix of user ratings of items
      R (ndarray (num_movies, num_users))   : mask, R(i, j) = 1 if item i was
                                              rated by user j, else 0
      lambda_ (float): regularization parameter
    Returns:
      J (float) : 0.5 * sum of masked squared errors
                  + (lambda_ / 2) * (sum(X**2) + sum(W**2))
    """
    # Predicted rating for every (item, user) pair.
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b

    # Squared error everywhere, then zero out the pairs that have no rating.
    squared_error = (predicted - Y) ** 2
    rated_error = squared_error * R

    # Only X and W are penalized; the bias b is left unregularized.
    weight_norm = tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2)

    return 0.5 * tf.reduce_sum(rated_error) + (lambda_ / 2) * weight_norm

In [3]:
# Source of the race-results table (CSV with a "Race" column).
RESULTS_URL = "https://raw.githubusercontent.com/schubertjan/race-recommender/master/race_results.csv"

# Load the table and key it by race so each row is addressed by race name.
results_df = pd.read_csv(RESULTS_URL).set_index("Race")
results_df

Unnamed: 0_level_0,aagaard hansen tobias,aalerud katrine,aalrust håkon,aalto jimi,aas eirik vang,aasheim ludvig,aaskov pallesen jeppe,aasvold kristian,abay burak,abazi qendrim,...,štybar zdeněk,šēlis jānis,ťoupalík adam,ťoupalík jakub,żelazowski michał,żuber adam,żurek jakub,žigart urška,žumer matic,țvetcov serghei
Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-districtenpijl-ekeren-deurne-me,,,,,,,,,,,...,,,,,,,,,,
Eschborn-Frankfurt,,,,,,,,,,,...,,,,,,,,,,
adriatica-ionica,,,,,,,,,,,...,,,,,,,,,,
albani-classic-fyen-rundt,,,,,,,,,,,...,,,,,,,,,,
amstel-gold,,,,,,,,,,,...,20.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vuelta-ciclista-a-la-region-de-murcia,,,,,,,,0.0,,,...,,,,,,,,,,
world-championship,,,,,,,,,,,...,,,0.0,,,,,,,
world-championship-we,,20.0,,,,,,,,,...,,,,,,,,0.0,,
youngster-coast-challenge,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Raw results as a matrix: one row per race (results_df's index), one column
# per rider (results_df's columns); missing results are NaN.
Y = results_df.to_numpy()

# R[i, j] = 1 where race i has a result for rider j, 0 where it is missing.
R = np.ones(Y.shape)
R[np.isnan(Y)] = 0

# Log-transform once and reuse it for both the statistics and the targets.
Y_log = np.log1p(Y)

# Per-race mean and standard deviation of the observed (non-NaN) log results.
Y_mean = np.nanmean(Y_log, axis=1)
Y_std = np.nanstd(Y_log, axis=1)
# A race whose observed results are all identical has zero spread. Dividing by
# it would turn those real results into NaN, and the -99 fill below would then
# hand them to the optimizer as genuine targets (R is still 1 there). Use a
# unit scale for such races so normalization and its inverse stay well defined.
Y_std[Y_std == 0] = 1.0

# Standardize each race (row) to zero mean / unit std on the log scale.
Y_norm = (Y_log - Y_mean[:, None]) / Y_std[:, None]
# Missing entries need a finite placeholder so NaN never reaches the cost
# computation; the value is irrelevant because the cost multiplies by R.
Y_norm[R == 0] = -99

In [5]:
#  Useful Values
num_races, num_riders = Y.shape
num_features = 100
lambda_ = 1      # regularization strength passed to cofi_cost_func
iterations = 1000

# Set Initial Parameters (W, X, b), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_riders,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_races, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_riders),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

# `step` rather than `iter`: the loop variable lives at notebook scope, so
# naming it `iter` would shadow the builtin for every later cell.
for step in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Y_norm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one Adam step, updating the variables in place to reduce the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 4096813.5
Training loss at iteration 20: 210955.5
Training loss at iteration 40: 88783.4
Training loss at iteration 60: 49732.9
Training loss at iteration 80: 33001.4
Training loss at iteration 100: 24633.9
Training loss at iteration 120: 19784.4
Training loss at iteration 140: 16671.6
Training loss at iteration 160: 14530.1
Training loss at iteration 180: 12977.0
Training loss at iteration 200: 11804.2
Training loss at iteration 220: 10891.2
Training loss at iteration 240: 10163.9
Training loss at iteration 260: 9574.6
Training loss at iteration 280: 9090.6
Training loss at iteration 300: 8689.0
Training loss at iteration 320: 8352.8
Training loss at iteration 340: 8069.4
Training loss at iteration 360: 7828.8
Training loss at iteration 380: 7623.5
Training loss at iteration 400: 7447.4
Training loss at iteration 420: 7295.5
Training loss at iteration 440: 7163.9
Training loss at iteration 460: 7049.4
Training loss at iteration 480: 6949.4
Training loss a

In [6]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm_e = np.multiply(p, Y_std.reshape(p.shape[0], 1)) + Y_mean.reshape(p.shape[0], 1)
pm = np.exp(pm_e)-1

In [7]:
rider_name = "kristoff alexander"

# Boolean mask over the columns, then the positional index of the match.
j = results_df.columns.isin([rider_name])
rider = np.flatnonzero(j)

# Without exactly one matching column the frame below cannot be built (its
# value arrays would not line up with the index); fail with a clear message
# instead of a length-mismatch error from the DataFrame constructor.
if rider.size != 1:
    raise KeyError(
        f"expected exactly one column named {rider_name!r} in results_df, found {rider.size}"
    )

# Side-by-side view of the prediction (y_hat) and the recorded value (y)
# for this rider, one row per race.
check = pd.DataFrame(
    data={
        "y_hat": pm[:, rider].reshape(-1,),
        "y": results_df.iloc[:, rider].values.reshape(-1,),
    },
    index=results_df.index,
)
# Keep only the races where this rider has a recorded value to compare against.
check.dropna()

Unnamed: 0_level_0,y_hat,y
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
Eschborn-Frankfurt,184.609912,215.0
clasica-de-almeria,119.443921,150.0
dauphine,0.095111,0.0
deia-trophy,44.0024,50.0
dwars-door-vlaanderen,51.435339,60.0
e3-harelbeke,0.443489,0.0
gent-wevelgem,38.386694,40.0
gp-du-canton-d-argovie,0.116388,0.0
gp-jef-scherens,59.775953,70.0
kuurne-brussel-kuurne,102.994396,125.0


In [8]:
# google.colab exists only inside a Colab runtime. Import it defensively so
# the cell still writes the CSV when the notebook is run elsewhere.
try:
    from google.colab import files
except ImportError:
    files = None

y_hat_df = pd.DataFrame(pm, index=results_df.index, columns=results_df.columns)

# Passing dtype=int to the DataFrame constructor with float data behaves
# differently across pandas versions and cannot represent NaN. Convert
# explicitly instead: truncate toward zero (what a plain integer cast does)
# and use the nullable Int64 dtype so missing predictions are written as
# empty cells rather than failing or becoming garbage integers.
np.trunc(y_hat_df).astype("Int64").to_csv("y_hat.csv", index=True)

# Offer the file as a browser download when running on Colab; otherwise it
# simply stays in the working directory.
if files is not None:
    files.download('y_hat.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>