In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras

In [39]:
def stand(x):
  return (x-np.nanmean(x)) / np.nanstd(x)

def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    f_x = tf.linalg.matmul(X, tf.transpose(W)) + b
    E = (f_x - Y)**2
    j = E*R
    J = 0.5 * tf.reduce_sum(j) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [40]:
results_df = pd.read_csv("https://raw.githubusercontent.com/schubertjan/race-recommender/master/race_results.csv")
results_df = results_df.set_index("Race")
results_df.shape

(270, 5855)

In [41]:
Y = results_df.to_numpy()
# with logs
# Y_norm = results_df.apply(lambda row: stand(np.log(row + 1)), axis=1).to_numpy()
# Y_norm[np.isnan(Y_norm)] = -99

# Y_mean = np.nanmean(np.log(Y+1), axis=1)
# Y_std = np.nanstd(np.log(Y+1), axis=1)

# no logs
Y_norm = results_df.apply(lambda row: stand(row), axis=1).to_numpy()
Y_norm[np.isnan(Y_norm)] = -99

Y_mean = np.nanmean(Y, axis=1)
Y_std = np.nanstd(Y, axis=1)

R = np.ones(Y.shape)
R[np.isnan(Y)] = 0

In [42]:
#  Useful Values
num_races, num_riders = Y.shape
num_features = 150
lambda_ = 1
# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_riders,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_races, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_riders),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=0.1)

iterations = 1000
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Y_norm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 5225770.0
Training loss at iteration 20: 357844.2
Training loss at iteration 40: 142841.4
Training loss at iteration 60: 77798.7
Training loss at iteration 80: 49409.4
Training loss at iteration 100: 34686.3
Training loss at iteration 120: 26089.7
Training loss at iteration 140: 20676.8
Training loss at iteration 160: 17081.1
Training loss at iteration 180: 14587.6
Training loss at iteration 200: 12795.8
Training loss at iteration 220: 11469.4
Training loss at iteration 240: 10463.0
Training loss at iteration 260: 9683.9
Training loss at iteration 280: 9070.5
Training loss at iteration 300: 8581.0
Training loss at iteration 320: 8185.9
Training loss at iteration 340: 7863.9
Training loss at iteration 360: 7599.3
Training loss at iteration 380: 7380.3
Training loss at iteration 400: 7197.7
Training loss at iteration 420: 7044.7
Training loss at iteration 440: 6915.5
Training loss at iteration 460: 6806.1
Training loss at iteration 480: 6712.7
Training loss 

In [43]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm_e = np.multiply(p, Y_std.reshape(p.shape[0], 1)) + Y_mean.reshape(p.shape[0], 1)

# # use a log
# pm = np.exp(pm_e)-1
# don't log
pm = pm_e

In [44]:
rider_name = "roglič primož"
j = results_df.columns.isin([rider_name])
rider = np.arange(0, results_df.shape[1])[j]

check = pd.DataFrame(data={"y_hat": pm[:, rider].reshape(-1,), "y": results_df.iloc[:, rider].values.reshape(-1,)}, index = results_df.index)
check.dropna()

Unnamed: 0_level_0,y_hat,y
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
amstel-gold,21.393745,0.0
dauphine,68.080393,67.5
il-lombardia,280.621101,275.0
itzulia-basque-country,69.743317,70.0
la-fleche-wallone,315.384202,320.0
liege-bastogne-liege,282.944308,275.0
milano-sanremo,39.021591,30.0
paris-nice,150.041089,155.0
tour-de-france,133.401746,121.666667
tour-de-l-ain,32.141618,33.0


In [45]:
check.sort_values("y_hat", ascending=False)

Unnamed: 0_level_0,y_hat,y
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
vuelta-a-espana,537.670684,548.0
san-sebastian,471.602763,
paris-roubaix,430.651012,
gent-wevelgem,402.725424,
bretagne-classic,381.814669,
...,...,...
trofeo-citta-di-meldola-g.p.-awc-event,0.000000,
european-continental-championships-wj-road,0.000000,
commonwealth-games,0.000000,
commonwealth-games-itt2,0.000000,


In [33]:
results_df.max(axis=1)["world-championship-we"]

600.0

In [46]:
from google.colab import files

pd.DataFrame(pm, index = results_df.index, columns=results_df.columns, dtype = int).to_csv("y_hat.csv", index=True)

files.download('y_hat.csv')

  This is separate from the ipykernel package so we can avoid doing imports until


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>