Factors not captured in the analysis:
- Parcours (route profile) of the race
- Abandonments and unexpected time loss
    - Riders who lost time due to crashes, injuries, illness and other reasons out of their control.
- The weather
    - Some riders do better in wet, windy or hot conditions than others.
    
Factors not used due to potential inaccuracies:
- Height
    - Value may not be accurate in all cases.
- Weight
    - The value taken for each rider is their most recent value. 
    - A rider's weight fluctuates often during a season.
    - The assumption is that every rider at the Tour will be at their ideal weight coming into it.

# Import Libraries and Set Directories

In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.impute import KNNImputer

from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

#import plotly.graph_objs as go
#from plotly.offline import iplot

import matplotlib as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf
from tensorflow.keras.models import Sequential

import keras
import keras.backend as kb
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from keras.optimizers import adam_v2

In [4]:
# Base directory that the input CSV files are read from (hard-coded, drive-letter path).
dPath = "D:/Data/cycling/"

# Define Functions

## Clean 2021 Data

In [5]:
def clean_2021_data(tdf_2021):
    """Prepare the 2021 start-list frame for use alongside the training data.

    The frame is modified in place and the same object is returned.

    Steps performed:
      * replace cell values equal to "AttError" with "Unknown";
      * set the weight of the rider whose URL is "mark-donovan" to 70 and
        keep every other rider's existing weight;
      * add "GC" and "Seconds Behind Winner" columns filled with NaN;
      * add a binary "Team Leader" column: 1 for the riders listed below,
        0 for everyone else.

    Parameters
    ----------
    tdf_2021 : pandas.DataFrame
        Must contain the columns "URL" and "Weight".

    Returns
    -------
    pandas.DataFrame
        ``tdf_2021`` itself, after the in-place modifications.
    """
    tdf_2021.replace("AttError", "Unknown", inplace=True)

    # Fill the missing weight. The fallback must be the existing "Weight"
    # column; falling back to "Height" would overwrite every other rider's
    # weight with their height.
    tdf_2021["Weight"] = np.where(
        tdf_2021["URL"] == "mark-donovan", 70, tdf_2021["Weight"]
    )

    # Create GC and SBW columns (unknown for this season)
    tdf_2021["GC"] = np.nan
    tdf_2021["Seconds Behind Winner"] = np.nan

    # One designated leader per team, identified by rider URL.
    team_leaders = (
        "tadej-pogacar",         # UAE
        "primoz-roglic",         # Lotto-NL
        "geraint-thomas",        # Ineos
        "christopher-froome",    # Israel
        "vincenzo-nibali",       # Trek
        "julian-alaphilippe",    # Quickstep
        "miguel-angel-lopez",    # Movistar
        "peter-sagan",           # BORA
        "david-gaudu",           # FDJ
        "guillaume-martin",      # Cofidis
        "mathieu-van-der-poel",  # Alpecin
        "rigoberto-uran",        # EF
        "benoit-cosnefroy",      # AG2R
        "warren-barguil",        # Arkea
        "soren-kragh-andersen",  # DSM
        "caleb-ewan",            # Lotto Soudal
        "jack-haig",             # Bahrain
        "michael-matthews",      # BikeExchange
        "jakob-fuglsang",        # Astana
        "sergio-luis-henao",     # Qhubeka
        "pierre-latour",         # Total Energies
        "louis-meintjes",        # Wanty
        "bryan-coquard",         # B&B
    )
    tdf_2021["Team Leader"] = np.where(tdf_2021["URL"].isin(team_leaders), 1, 0)

    return tdf_2021

## Impute Nulls

In [6]:
def ImputeNulls(df):
    """Drop DNF rows and fill missing values in selected columns with KNN imputation.

    Rows whose "GC" value equals the string "DNF" are removed. The columns
    listed in ``cols`` are then passed through ``KNNImputer(n_neighbors=2)``
    and written back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "GC" and every column named in ``cols`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame (the caller's frame is not modified) without DNF rows
        and with the selected columns imputed.
    """
    # .copy() gives an independent frame, so the column assignment below is
    # not a write into a filtered selection of the caller's data.
    df = df[df["GC"] != "DNF"].copy()

    cols = ["Age", "AverageResult_2UWT", "AverageResult_2HC", "BestResult_2UWT",
            "BestResult_2HC", "BestGTFinish", "numGTs", "WT"]

    imputer = KNNImputer(n_neighbors=2)
    df[cols] = imputer.fit_transform(df[cols])

    return df

## Add Team Information

In [7]:
def AddTeammateData(tdf_train):
    """Attach aggregate statistics about each rider's teammates.

    For every (team, rider) pair, the other riders of that team who are not
    flagged as team leader are grouped by "Team" and "Year" and summarised
    as:

      * ``Age_Team``          - mean of "Age"
      * ``numGTs_Team``       - sum of "numGTs"
      * ``BestGTFinish_Team`` - max of "BestGTFinish"

    The summary is left-joined onto the rider's own rows on "Team" and
    "Year", so a rider with no qualifying teammates in a given year gets
    NaN in the three new columns.

    Parameters
    ----------
    tdf_train : pandas.DataFrame
        Must contain "Team", "Year", "Rider", "Team Leader", "Age",
        "numGTs" and "BestGTFinish".

    Returns
    -------
    pandas.DataFrame
        All per-rider results stacked together. The index is not reset, so
        index labels repeat across riders. An empty frame is returned when
        there is nothing to process.
    """
    # Collect the per-rider frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    frames = []

    for team in tdf_train["Team"].unique():
        team_rows = tdf_train[tdf_train["Team"] == team]

        for rider in team_rows["Rider"].unique():
            rider_rows = team_rows[team_rows["Rider"] == rider]

            # Teammates = same team, not this rider, not a team leader.
            helpers = team_rows[
                (team_rows["Rider"] != rider) & (team_rows["Team Leader"] != 1)
            ]

            team_stats = helpers.groupby(["Team", "Year"]).agg(
                Age_Team=("Age", "mean"),
                numGTs_Team=("numGTs", "sum"),
                BestGTFinish_Team=("BestGTFinish", "max"),
            ).reset_index()

            frames.append(rider_rows.merge(team_stats, on=["Team", "Year"], how="left"))

    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat(frames)

## Clean Train Data

In [8]:
def clean_train_data(tdf_train):
    """Clean the historical results frame and derive the model features.

    Steps, in order:
      1. drop the rider 'BONNET WILLIAM';
      2. derive "YearsSinceFirstGT" / "YearsSinceLastGT" (-1 when the
         corresponding year is missing) and the binary "Giro" flag;
      3. fill missing Grand Tour counts with 0 and missing first/last year
         columns with the row's "Year";
      4. merge with the module-level ``teams`` frame on "Team"/"Year" and
         derive the binary "WT" flag from its "Rank" column;
      5. impute remaining nulls via ``ImputeNulls``;
      6. add teammate aggregates via ``AddTeammateData``.

    The derived year differences in step 2 are computed before the
    first/last year columns are filled in step 3, so riders without a
    previous Grand Tour keep the -1 marker.

    Parameters
    ----------
    tdf_train : pandas.DataFrame
        Raw training data. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the additional feature columns.

    Notes
    -----
    Relies on a module-level DataFrame named ``teams`` being defined before
    this function is called.
    """
    # Remove rider with errors. Copy so the column assignments below write
    # to an independent frame instead of a filtered selection.
    tdf_train = tdf_train[tdf_train["Rider"] != 'BONNET WILLIAM'].copy()

    # Insert Years Since First/Last Grand Tour values (-1 = no previous GT)
    tdf_train["YearsSinceFirstGT"] = np.where(
        tdf_train["FirstGT"].isna(), -1, tdf_train["Year"] - tdf_train["FirstGT"]
    )
    tdf_train["YearsSinceLastGT"] = np.where(
        tdf_train["LastGT"].isna(), -1, tdf_train["Year"] - tdf_train["LastGT"]
    )
    # Add value for whether rider rode Giro d'Italia that year or not
    tdf_train["Giro"] = np.where(
        (tdf_train["YearsSinceLastGT"] == 0) & (tdf_train["Year"] != 2020), 1, 0
    )

    # Fill NA values for specific columns. Assign the result back rather
    # than calling fillna(inplace=True) on a column selection: the latter
    # is chained assignment and does not update the frame under pandas
    # Copy-on-Write.
    for col in ("numTours", "numGiros", "numVueltas", "numGTs", "NumGTDNFs"):
        tdf_train[col] = tdf_train[col].fillna(0)
    for col in ("FirstGT", "LastGT", "FirstTour", "LastTour"):
        tdf_train[col] = tdf_train[col].fillna(tdf_train["Year"])

    # Merge with Team Data to get value for whether rider rides for World-Tour team or not
    tdf_train = tdf_train.merge(teams, on=['Team', 'Year'], how='left')
    tdf_train["WT"] = np.where(tdf_train["Rank"] == "WT", 1, 0)
    del tdf_train["Rank"]

    # Impute Nulls for specific columns
    tdf_train = ImputeNulls(tdf_train)

    # Add Team information
    tdf_train = AddTeammateData(tdf_train)

    return tdf_train

In [9]:
# Binary Variables = 'Team Leader', 'Giro', 'WT'

## Normalize Data

In [10]:
def NormalizeData(df):
    """Min-max scale the feature columns of ``df`` in place and return it.

    Identifier columns, target columns and the binary variables
    ('Team Leader', 'Giro', 'WT') are left untouched; every other column is
    rescaled with a freshly fitted ``MinMaxScaler``.
    """
    untouched = {'Rider', 'GC', 'Seconds Behind Winner', 'Top10', 'Year', 'Team',
                 'URL', 'Nationality', 'Team Leader', 'Giro', 'WT'}
    to_scale = []
    for name in df.columns.to_list():
        if name not in untouched:
            to_scale.append(name)

    scaler = preprocessing.MinMaxScaler()
    df[to_scale] = scaler.fit_transform(df[to_scale])
    return df

## Variance Inflation Factor

In [11]:
def calc_vif(df, features):
    """Print and display the variance inflation factor of each feature.

    ``features`` is the list of column names of ``df`` to evaluate; one VIF
    is computed per column position of ``df[features]``.
    """
    design = df[features]

    vif_table = pd.DataFrame()
    vif_table["Variables"] = features

    scores = []
    for position in range(design.shape[1]):
        scores.append(variance_inflation_factor(design.values, position))
    vif_table["VIF"] = scores

    print("Variance Inflation Factor (if higher than 5 may need reviewing):")
    display(vif_table)

## Model Statistics

In [12]:
# creating function to get model statistics
def get_stats(df, predictor, features):
    """Fit an OLS model of ``df[predictor]`` on ``df[features]`` and print its summary."""
    target = df[predictor]
    regressors = df[features]
    fitted = sm.OLS(target, regressors).fit()
    print(fitted.summary())

## Heatmap/Correlations

In [13]:
# Correlate and plot heatmap
def heatmap(df, predictor, features):
    """Plot and display each feature's correlation with ``predictor``.

    The correlations are rounded to two decimals and sorted in descending
    order; the same single-column table is drawn as a heatmap and then
    displayed. ``features`` is not modified.
    """
    selected = list(features)
    selected.append(predictor)

    correlations = df[selected].corr().round(2)
    by_target = correlations[[predictor]].sort_values(predictor, ascending=False)

    sns.heatmap(by_target, cmap="coolwarm", vmin=-1, vmax=1)
    print("Correlations:")
    display(by_target)

## Plot Actual vs Predicted

In [14]:
# Plot the actual vs predicted values
def plot_actual_vs_pred(df, predictor):
    """Scatter-plot actual against predicted values.

    Reads the columns ``<predictor>_Actual`` (x axis) and
    ``<predictor>_Pred`` (y axis) from ``df``; no regression line is drawn.
    """
    # `height` replaces the former `size` keyword of seaborn.lmplot.
    sns.lmplot(x=predictor + "_Actual", y=predictor + "_Pred", data=df, fit_reg=False, height=7)

In [15]:
# Plot the actual vs ranked predicted values
def plot_actual_vs_pred_ranked(df, predictor):
    """Scatter-plot actual values against ranked predictions.

    Reads the columns ``<predictor>_Actual`` (x axis) and
    ``<predictor>_Pred_Ranked`` (y axis) from ``df``; no regression line is
    drawn.
    """
    # `height` replaces the former `size` keyword of seaborn.lmplot.
    sns.lmplot(x=predictor + "_Actual", y=predictor + "_Pred_Ranked", data=df, fit_reg=False, height=7)

## TensorFlow (Keras)

In [16]:
def baseline_model():
    """Build and compile a small fully connected regression network.

    Architecture: a 13-unit ReLU layer taking 13 inputs, followed by a
    single output unit. Compiled with mean squared error loss and the
    'adam' optimizer.
    """
    hidden = Dense(13, input_dim=13, kernel_initializer='normal', activation='relu')
    output = Dense(1, kernel_initializer='normal')

    network = Sequential()
    for layer in (hidden, output):
        network.add(layer)

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

In [17]:
def KerasRegressionModel(train, predictor, features, test_year):
    """Train a dense Keras regressor on seasons before ``test_year`` and score that season.

    Parameters
    ----------
    train : pandas.DataFrame
        Full data set; must contain "Year", "Rider", ``predictor`` and all
        ``features``. The caller's frame is not modified.
    predictor : str
        Name of the target column. When it is "GC", DNF and missing targets
        are removed from the training rows and set to 0 in the test rows.
    features : list of str
        Input columns for the network.
    test_year : int
        Season held out as the test set; earlier seasons form the training
        set.

    Returns
    -------
    pandas.DataFrame
        The normalised test rows with an added "Prediction" column.

    Notes
    -----
    NOTE(review): the training and test frames are each scaled by their own
    ``NormalizeData`` call, so they are not guaranteed to share the same
    scaling - confirm this is intended.
    """
    # Split train and test data (copies, so the edits below stay local)
    test = train[train["Year"] == test_year].copy()
    train = train[train["Year"] < test_year].copy()

    # Remove redundant columns
    del train["Year"]
    del test["Year"]
    del train["Rider"]

    if predictor == "GC":
        # Remove DNF and missing values from training data
        train = train[train[predictor] != "DNF"].dropna(subset=[predictor]).copy()
        # Go through float so that values such as "12.0", 12.0 and "12" all parse.
        train[predictor] = train[predictor].astype(float).astype(int)

    # Normalize Train Data
    train = NormalizeData(train)
    # Normalize Test data
    test = NormalizeData(test)

    if predictor == "GC":
        # Replace DNF / missing values with 0
        test["GC"] = test["GC"].fillna('0.0').replace("DNF", '0.0')
        test[predictor] = test[predictor].astype(float).astype(int)

    get_stats(train, predictor, features)
    calc_vif(train, features)
    heatmap(train, predictor, features)

    # The input width must match the number of feature columns fed to fit().
    model = keras.Sequential([
        keras.layers.Dense(32, activation=tf.nn.relu, input_shape=[len(features)]),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(1)])

    model.compile(loss='mean_squared_error', optimizer="adam")
    model.fit(train[features], train[predictor], epochs=500)

    # Score the held-out season so the returned frame carries the model output.
    test["Prediction"] = model.predict(test[features]).ravel()

    return test

In [18]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
def tf_Top10(train, features, test_year):
    """Train a binary classifier predicting a top-10 GC finish for ``test_year``.

    Parameters
    ----------
    train : pandas.DataFrame
        Full data set; must contain "GC", "Year", "Rider" and all
        ``features``.
    features : list of str
        Input columns for the network.
    test_year : int
        Season held out as the test set; earlier seasons form the training
        set.

    Returns
    -------
    pandas.DataFrame
        The test rows with an added "Prediction" column holding the raw
        sigmoid output of the network (not thresholded).

    Notes
    -----
    The frame passed in as ``train`` is modified in place: rows with a
    missing "GC" are dropped and a "Top10" column is added.
    """
    # Add Top10. "DNF" is mapped to 200 so that it never counts as a top-10 finish.
    train.dropna(subset=["GC"], inplace=True)
    finish = train["GC"].replace("DNF", 200.0).astype(float).astype(int)
    train["Top10"] = np.where(finish > 10, 0, 1)

    # Split train and test data (copies, so the edits below stay local)
    test = train[train["Year"] == test_year].copy()
    train = train[train["Year"] < test_year].copy()

    # Remove redundant columns
    del train["Year"]
    del test["Year"]
    del train["Rider"]

    display(train[["GC", "Top10"]].head())

    model = Sequential()
    model.add(Dense(units=32, activation='relu', input_dim=len(features)))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))

    # `metrics` is given as a list, the form accepted across Keras versions.
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

    model.fit(train[features], train["Top10"], epochs=200, batch_size=32)

    test["Prediction"] = model.predict(test[features]).ravel()

    return test

## TensorFlow Linear Regression

In [20]:
def _print_regression_metrics(y_true, y_pred, header):
    """Print ``header`` followed by the sklearn regression metrics for the given predictions."""
    print(header)
    print("Explained variance regression score: " + str(metrics.explained_variance_score(y_true, y_pred)))
    print("Maximum residual error: " + str(metrics.max_error(y_true, y_pred)))
    print("Mean absolute error regression loss: " + str(metrics.mean_absolute_error(y_true, y_pred)))
    print("Mean squared error regression loss: " + str(metrics.mean_squared_error(y_true, y_pred)))
    print("Mean squared logarithmic error regression loss: " + str(metrics.mean_squared_log_error(y_true, y_pred)))
    print("Median absolute error regression loss: " + str(metrics.median_absolute_error(y_true, y_pred)))
    print("Regression score function (R^2): " + str(metrics.r2_score(y_true, y_pred)))
    print("Mean Poisson deviance regression loss: " + str(metrics.mean_poisson_deviance(y_true, y_pred)))
    print("Mean Tweedie deviance regression loss: " + str(metrics.mean_tweedie_deviance(y_true, y_pred)))


def tf_linpredictor(train, predictor, features, test_year, split, epochs, batch_size):
    """Train a dense Keras regressor with MSE loss and report test metrics.

    Parameters
    ----------
    train : pandas.DataFrame
        Full data set; must contain "Rider", "Year", ``predictor`` and all
        ``features``. The caller's frame is not modified.
    predictor : str
        Name of the target column. When it is "GC", missing, empty and
        "DNF" targets are removed from the training rows.
    features : list of str
        Input columns for the network.
    test_year : int
        Held-out season; only used when ``split == "year"``.
    split : str
        "year" - train on seasons before ``test_year`` and test on that
        season; "sk" - drop 2021 and do a random 80/20
        ``train_test_split``.
    epochs, batch_size : int
        Passed to ``model.fit``.

    Returns
    -------
    pandas.DataFrame or None
        The normalised test rows with a "Prediction" column (plus
        "Prediction_Ranked" for the "year" split), or None when ``split``
        is not recognised.

    Notes
    -----
    NOTE(review): the training and test frames are each scaled by their own
    ``NormalizeData`` call, so they are not guaranteed to share the same
    scaling - confirm this is intended.
    """
    train = train[[*features, predictor, "Rider", "Year"]]

    if split == "year":
        # Split train and test data
        test = train[train["Year"] == test_year].drop(columns="Year")
        train = train[train["Year"] < test_year]
    elif split == "sk":
        # Alternative: random split over every season except 2021
        train = train[train["Year"] != 2021]
        train, test = train_test_split(train, test_size=0.2)
        test = test.copy()
    else:
        print("Invalid Split Name")
        return

    # Remove redundant columns (drop returns a new frame)
    train = train.drop(columns=["Year", "Rider"])

    # Remove DNFs and Nulls from GC
    if predictor == "GC":
        # Map missing / DNF / empty to 0, then drop those rows. The result
        # is assigned back instead of using inplace=True on a column
        # selection, which does not update the frame under pandas
        # Copy-on-Write.
        cleaned = train["GC"].fillna('0.0').replace(["DNF", ""], '0.0')
        train["GC"] = cleaned.astype(float).astype(int)
        train = train[train["GC"] != 0].copy()

    # Normalize Train Data
    train = NormalizeData(train)
    # Normalize Test data
    test = NormalizeData(test)

    model = Sequential()
    model.add(Dense(units=256, kernel_initializer='normal', activation='relu', input_dim=len(features)))
    model.add(Dense(units=16, kernel_initializer='normal', activation='relu'))
    model.add(Dense(units=8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='relu'))

    # `metrics` is given as a list, the form accepted across Keras versions.
    model.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])

    model.fit(train[features], train[predictor], epochs=epochs, batch_size=batch_size)

    test["Prediction"] = model.predict(test[features]).ravel()

    y_true = test[predictor]

    if split == "year":
        # Rank in order
        test["Prediction_Ranked"] = test["Prediction"].rank().astype(int)

        # Display Output
        display(test[["Rider", predictor, "Prediction_Ranked", "Prediction"]].sort_values(by="Prediction").head(30))

        _print_regression_metrics(y_true, test["Prediction"],
                                  "Checking accuracy scores for Prediction:")
        print("\n")
        _print_regression_metrics(y_true, test["Prediction_Ranked"],
                                  "Checking accuracy scores for Prediction Ranked:")

    elif split == "sk":
        # Display Output. Use `predictor`, the only target column kept in
        # the frame, rather than a hard-coded "GC".
        display(test[["Rider", "Year", predictor, "Prediction"]].sort_values(by="Prediction").head(30))

        _print_regression_metrics(y_true, test["Prediction"],
                                  "Checking accuracy scores for Prediction:")

    return test

In [21]:
def _print_regression_metrics_huber(y_true, y_pred, header):
    """Print ``header`` followed by the sklearn regression metrics for the given predictions.

    Poisson and Tweedie deviance are not reported for this model.
    NOTE(review): presumably because they reject non-positive predictions -
    confirm.
    """
    print(header)
    print("Explained variance regression score: " + str(metrics.explained_variance_score(y_true, y_pred)))
    print("Maximum residual error: " + str(metrics.max_error(y_true, y_pred)))
    print("Mean absolute error regression loss: " + str(metrics.mean_absolute_error(y_true, y_pred)))
    print("Mean squared error regression loss: " + str(metrics.mean_squared_error(y_true, y_pred)))
    print("Mean squared logarithmic error regression loss: " + str(metrics.mean_squared_log_error(y_true, y_pred)))
    print("Median absolute error regression loss: " + str(metrics.median_absolute_error(y_true, y_pred)))
    print("Regression score function (R^2): " + str(metrics.r2_score(y_true, y_pred)))


def tf_linpredictor_huber(train, predictor, features, test_year, split, epochs, batch_size):
    """Train a dense Keras regressor with Huber loss and report test metrics.

    Parameters
    ----------
    train : pandas.DataFrame
        Full data set; must contain "Rider", "Year", ``predictor`` and all
        ``features``. The caller's frame is not modified.
    predictor : str
        Name of the target column. When it is "GC", missing, empty and
        "DNF" targets are removed from the training rows.
    features : list of str
        Input columns for the network.
    test_year : int
        Held-out season; only used when ``split == "year"``.
    split : str
        "year" - train on seasons before ``test_year`` and test on that
        season; "sk" - drop 2021 and do a random 80/20
        ``train_test_split``.
    epochs, batch_size : int
        Passed to ``model.fit``.

    Returns
    -------
    pandas.DataFrame or None
        The normalised test rows with a "Prediction" column (plus
        "Prediction_Ranked" for the "year" split), or None when ``split``
        is not recognised.

    Notes
    -----
    NOTE(review): the training and test frames are each scaled by their own
    ``NormalizeData`` call, so they are not guaranteed to share the same
    scaling - confirm this is intended.
    """
    train = train[[*features, predictor, "Rider", "Year"]]

    if split == "year":
        # Split train and test data
        test = train[train["Year"] == test_year].drop(columns="Year")
        train = train[train["Year"] < test_year]
    elif split == "sk":
        # Alternative: random split over every season except 2021
        train = train[train["Year"] != 2021]
        train, test = train_test_split(train, test_size=0.2)
        test = test.copy()
    else:
        print("Invalid Split Name")
        return

    # Remove redundant columns (drop returns a new frame)
    train = train.drop(columns=["Year", "Rider"])

    # Remove DNFs and Nulls from GC
    if predictor == "GC":
        # Map missing / DNF / empty to 0, then drop those rows. The result
        # is assigned back instead of using inplace=True on a column
        # selection, which does not update the frame under pandas
        # Copy-on-Write.
        cleaned = train["GC"].fillna('0.0').replace(["DNF", ""], '0.0')
        train["GC"] = cleaned.astype(float).astype(int)
        train = train[train["GC"] != 0].copy()

    # Normalize Train Data
    train = NormalizeData(train)
    # Normalize Test data
    test = NormalizeData(test)

    model = Sequential()
    model.add(Dense(units=256, kernel_initializer='normal', activation='relu', input_dim=len(features)))
    model.add(Dense(units=16, kernel_initializer='normal', activation='relu'))
    model.add(Dense(units=8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='relu'))

    # `metrics` is given as a list, the form accepted across Keras versions.
    model.compile(loss='huber', optimizer="adam", metrics=['accuracy'])

    model.fit(train[features], train[predictor], epochs=epochs, batch_size=batch_size)

    test["Prediction"] = model.predict(test[features]).ravel()

    y_true = test[predictor]

    if split == "year":
        # Rank in order
        test["Prediction_Ranked"] = test["Prediction"].rank().astype(int)

        # Display Output
        display(test[["Rider", predictor, "Prediction_Ranked", "Prediction"]].sort_values(by="Prediction").head(30))

        _print_regression_metrics_huber(y_true, test["Prediction"],
                                        "Checking accuracy scores for Prediction:")
        print("\n")
        _print_regression_metrics_huber(y_true, test["Prediction_Ranked"],
                                        "Checking accuracy scores for Prediction Ranked:")

    elif split == "sk":
        # Display Output. Use `predictor`, the only target column kept in
        # the frame, rather than a hard-coded "GC".
        display(test[["Rider", "Year", predictor, "Prediction"]].sort_values(by="Prediction").head(30))

        _print_regression_metrics_huber(y_true, test["Prediction"],
                                        "Checking accuracy scores for Prediction:")

    return test

# Read and Clean Data

In [22]:
# Load the training results from "tdf_train_2007_2020.csv" under dPath.
tdf_train = pd.read_csv(dPath+"tdf_train_2007_2020"+".csv")

In [23]:
#tdf_2021 = pd.read_csv(dPath+"tdf_2021"+".csv")

In [24]:
#tdf_2021 = clean_2021_data(tdf_2021)

In [25]:
#tdf_train = pd.concat([tdf_train, tdf_2021], ignore_index=True)

In [26]:
# Team table; clean_train_data() reads this module-level name when merging on Team/Year.
teams = pd.read_csv(dPath+"Teams.csv", engine="python")

In [27]:
#tdf_train.isna().sum()

In [28]:
# Clean the raw data and derive features; requires `teams` to be loaded first.
tdf_train = clean_train_data(tdf_train)

In [29]:
# regex=True: any occurrence of 'AttError' inside a string value is replaced, not only whole-cell matches.
tdf_train = tdf_train.replace('AttError','Unknown', regex=True)

In [30]:
# Keep only rows that have a value for "Days since last race".
tdf_train = tdf_train[~tdf_train["Days since last race"].isna()]

# Select Model Variables

In [31]:
# Print all possible feature variables
# Every column except identifiers and targets is a candidate feature.
all_cols = tdf_train.columns.to_list()
all_cols = [e for e in all_cols if e not in ('Rider','Year','GC','Seconds Behind Winner','Top10','Team','URL','Nationality')]
print(all_cols)

['Age', 'Team Leader', 'Days since last race', 'AverageResult_2UWT', 'AverageResult_2HC', 'BestResult_2UWT', 'BestResult_2HC', 'Total_KMs', 'UCIPoints', 'StageRaces', 'OneDayRaces', 'StageRaceDNFs', 'OneDayDNFs', 'AverageResult_2Other', 'BestResult_2Other', 'Height', 'Weight', 'numTours', 'numGiros', 'numVueltas', 'BestGTFinish', 'BestTourFinish', 'BestGTStageFinish', 'FirstGT', 'LastGT', 'FirstTour', 'LastTour', 'NumGTDNFs', 'numGTs', 'YearsSinceFirstGT', 'YearsSinceLastGT', 'Giro', 'WT', 'Age_Team', 'numGTs_Team', 'BestGTFinish_Team']


In [32]:
# NOTE(review): 'NumStageRaces' and 'NumDNFs' do not appear in the column list
# printed by the previous cell ('StageRaces', 'StageRaceDNFs', 'OneDayDNFs' and
# 'NumGTDNFs' do) - selecting this list would presumably raise a KeyError; confirm.
features1 = ['Age', 'BestResult_2UWT',
              'Total_KMs', 'UCIPoints', 'BestGTFinish', 'NumStageRaces',
              'NumDNFs', 'numGTs', 'YearsSinceLastGT', 'Giro', 'BestGTFinish_Team']

In [33]:
features2 = ['Age', 'BestResult_2UWT', 'StageRaces', 'OneDayRaces', 'StageRaceDNFs',
              'Total_KMs', 'BestGTFinish', 'WT', 'Giro',
              'numGTs', 'Days since last race']

In [34]:
features3 = ['Age', 'Team Leader', 'BestResult_2UWT',
              'Total_KMs', 'UCIPoints', 'StageRaces', 'OneDayRaces', 'StageRaceDNFs', 'OneDayDNFs', 'BestGTFinish',  
              'NumGTDNFs', 'numGTs', 'YearsSinceFirstGT',
              'YearsSinceLastGT', 'Giro', 'numGTs_Team', 'BestGTFinish_Team','Days since last race']

In [35]:
# Feature set passed to tf_linpredictor / tf_linpredictor_huber in the "Run Models" cells.
features4 = ['Age', 'Days since last race', 'BestResult_2UWT', 
             'Total_KMs', 'UCIPoints', 'StageRaceDNFs','Team Leader','YearsSinceFirstGT',
             'BestGTFinish', 'NumGTDNFs', 'numGTs', 'YearsSinceLastGT', 'Giro', 'WT', 'Age_Team','BestGTFinish_Team']

# Run Models

In [36]:
tf_linearModel = tf_linpredictor(tdf_train, "GC", features4, 2020, "year", 200, 32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


Unnamed: 0,Rider,GC,Prediction_Ranked,Prediction
0,POGAČAR TADEJ,1.0,1,0.32632
2,YATES ADAM,9.0,2,0.585989
0,PINOT THIBAUT,29.0,3,3.180756
0,LÓPEZ MIGUEL ÁNGEL,6.0,4,5.654111
0,QUINTANA NAIRO,17.0,5,6.986652
0,DUMOULIN TOM,7.0,6,10.653196
0,MARTIN GUILLAUME,11.0,7,11.258402
0,LANDA MIKEL,4.0,8,12.273592
1,BENNETT GEORGE,34.0,9,13.09936
1,PORTE RICHIE,3.0,10,14.95322


Checking accuracy scores for Prediction:
Explained variance regression score: 0.6311173759151798
Maximum residual error: 72.50726318359375
Mean absolute error regression loss: 21.71796233201849
Mean squared error regression loss: 792.3015977047538
Mean squared logarithmic error regression loss.0.35255442527187997
Median absolute error regression loss: 16.211471557617188
Regression score function (R^2): 0.5555002735141126
Mean Poisson deviance regression loss: 12.350045172206014
Mean Tweedie deviance regression loss: 792.3015977047538


Checking accuracy scores for Prediction Ranked:
Explained variance regression score: 0.6730191324851994
Maximum residual error: 79.0
Mean absolute error regression loss: 18.03448275862069
Mean squared error regression loss: 582.9172413793103
Mean squared logarithmic error regression loss.0.30972989952208235
Median absolute error regression loss: 14.0
Regression score function (R^2): 0.6729697944474349
Mean Poisson deviance regression loss: 11.45673581568

In [37]:
tf_linearModel = tf_linpredictor_huber(tdf_train, "GC", features4, 2020, "year", 200, 32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


Unnamed: 0,Rider,GC,Prediction_Ranked,Prediction
2,YATES ADAM,9.0,1,1.617378
0,POGAČAR TADEJ,1.0,2,3.302804
0,PINOT THIBAUT,29.0,3,5.2305
0,LÓPEZ MIGUEL ÁNGEL,6.0,4,5.29742
0,QUINTANA NAIRO,17.0,5,9.605963
0,LANDA MIKEL,4.0,6,11.273294
0,MARTIN GUILLAUME,11.0,7,13.49766
0,DUMOULIN TOM,7.0,8,14.430902
0,BENOOT TIESJ,75.0,9,16.092979
1,PORTE RICHIE,3.0,10,17.501684


Checking accuracy scores for Prediction:
Explained variance regression score: 0.5879409013666645
Maximum residual error: 79.81800842285156
Mean absolute error regression loss: 22.767140416441293
Mean squared error regression loss: 878.8338111877466
Mean squared logarithmic error regression loss.0.33577502802391
Median absolute error regression loss: 16.115066528320312
Regression score function (R^2): 0.5069536779539936


Checking accuracy scores for Prediction Ranked:
Explained variance regression score: 0.6651880198171872
Maximum residual error: 76.0
Mean absolute error regression loss: 18.24137931034483
Mean squared error regression loss: 596.8758620689655
Mean squared logarithmic error regression loss.0.3238248432843562
Median absolute error regression loss: 14.0
Regression score function (R^2): 0.6651386817794227


In [38]:
# Raises ZeroDivisionError unconditionally.
# NOTE(review): presumably a deliberate stop so "Run All" halts before the
# scratch cells below — confirm before removing.
1/0

ZeroDivisionError: division by zero

In [None]:
# Display the 30 rows with the smallest "Seconds Behind Winner" values,
# restricted to the rider name, the ranked prediction and that column.
(
    tf_linearModel[["Rider", "Prediction_Ranked", "Seconds Behind Winner"]]
    .sort_values(by="Seconds Behind Winner")
    .head(30)
)

In [None]:
# Write the ridge-model result frame to a CSV under `dPath`, without the
# index column.
# NOTE(review): `dPath` is concatenated as a plain string, so it presumably
# already ends with a path separator — confirm where dPath is defined.
ridgemodel.to_csv(
    dPath + "tdf_2019_prediction.csv",
    index=False,
)

In [None]:
# Fit the linear-regression model on the training frame with "GC" as the
# target column and the `features1` feature list.
# NOTE(review): 2021 is presumably the season to predict / hold out — confirm
# against FitLinearRegressionModel.
linearmodel = FitLinearRegressionModel(
    tdf_train,
    "GC",
    features1,
    2021,
)

In [None]:
# Raises ZeroDivisionError unconditionally.
# NOTE(review): presumably a deliberate stop so "Run All" does not continue
# into the cells below — confirm before removing.
1/0

In [None]:
def scatter_plot_1(df, dim1, dim2):
    """Draw an interactive scatter plot of ``df[dim1]`` against ``df[dim2]``.

    Parameters
    ----------
    df
        Column-indexable table (used as ``df[dim1]`` / ``df[dim2]``).
    dim1 : str
        Column plotted on the x axis; also used as the x-axis title.
    dim2 : str
        Column plotted on the y axis; also used as the y-axis title.

    Returns
    -------
    None
        The figure is passed to ``iplot`` for display.

    Notes
    -----
    * ``go`` and ``iplot`` must be in scope. The plotly imports in the
      notebook's first import cell are commented out, so they have to be
      re-enabled before this function can run.
    * Hover labels come from ``df["Rider"]`` when that column exists, so the
      labels always line up with the plotted rows. Otherwise the function
      falls back to the notebook-level ``tdf_train_fin["Rider"]``.
    """
    # Prefer labels from the frame being plotted; a global frame is only
    # row-aligned with `df` by coincidence.
    if "Rider" in df:
        hover_text = df["Rider"]
    else:
        hover_text = tdf_train_fin["Rider"]

    trace1 = go.Scatter(
        x=df[dim1],
        y=df[dim2],
        mode="markers",
        name="2014",  # legend label, hard-coded in the original notebook
        marker=dict(color='rgba(255, 128, 255, 0.8)'),
        text=hover_text,
    )

    layout = dict(
        # Spaces around "vs" so the title reads "A vs B" instead of "AvsB".
        title=dim1 + ' vs ' + dim2,
        xaxis=dict(title=dim1, ticklen=5, zeroline=False),
        yaxis=dict(title=dim2, ticklen=5, zeroline=False),
    )
    fig = dict(data=[trace1], layout=layout)
    iplot(fig)