In [63]:
import pandas as pd
import numpy as np
import os

# Elo
Run Elo over a data set.

In [71]:
start_year = 1968 # 1968
end_year = 2018  # The Kovalchik paper ends in 2014.
base_dir = r"C:/Users/Paul/tennis_atp"
filename_label = r"atp_matches_%d.csv"

def process_year_df(df_year):
    df = df_year.loc[:, ['tourney_name', 'tourney_level', 'tourney_date', 'match_num', 'score', 'round', 'winner_id', 'winner_name', 'loser_id', 'loser_name']]
    # Put in chronological order.
    df = df.sort_values(['tourney_date', 'tourney_name', 'match_num'], ascending=[True, True, True])
    return df

In [72]:
# Read the match data.
df = None
for year in range(start_year, end_year + 1):
    filename = os.path.join(base_dir, filename_label % year)
    df_year = pd.read_csv(filename)
    df_year = process_year_df(df_year)
#     df_year = df_year.loc[df_year['tourney_name'] == 'Montpellier']
#     print(filename)
#     print(df_year[0:300])
    
    print("%s: " % filename, df_year.shape)
    
    if df is None:
        df = df_year
    else:
        df = pd.concat([df, df_year])

C:/Users/Paul/tennis_atp\atp_matches_1968.csv:  (1232, 10)
C:/Users/Paul/tennis_atp\atp_matches_1969.csv:  (1451, 10)
C:/Users/Paul/tennis_atp\atp_matches_1970.csv:  (1681, 10)
C:/Users/Paul/tennis_atp\atp_matches_1971.csv:  (2712, 10)
C:/Users/Paul/tennis_atp\atp_matches_1972.csv:  (2934, 10)
C:/Users/Paul/tennis_atp\atp_matches_1973.csv:  (3837, 10)
C:/Users/Paul/tennis_atp\atp_matches_1974.csv:  (3899, 10)
C:/Users/Paul/tennis_atp\atp_matches_1975.csv:  (4038, 10)
C:/Users/Paul/tennis_atp\atp_matches_1976.csv:  (3871, 10)
C:/Users/Paul/tennis_atp\atp_matches_1977.csv:  (4046, 10)
C:/Users/Paul/tennis_atp\atp_matches_1978.csv:  (3851, 10)
C:/Users/Paul/tennis_atp\atp_matches_1979.csv:  (4003, 10)
C:/Users/Paul/tennis_atp\atp_matches_1980.csv:  (3955, 10)
C:/Users/Paul/tennis_atp\atp_matches_1981.csv:  (3835, 10)
C:/Users/Paul/tennis_atp\atp_matches_1982.csv:  (4105, 10)
C:/Users/Paul/tennis_atp\atp_matches_1983.csv:  (3557, 10)
C:/Users/Paul/tennis_atp\atp_matches_1984.csv:  (3222, 1

In [73]:
print(df.tourney_level.unique())

['G' 'A' 'D' 'M' 'F' 'C']


In [74]:
# Run Elo through the data.
# http://www.eloratings.net/about
# https://www.betfair.com.au/hub/better-betting/betting-strategies/tennis/tennis-elo-modelling/
# http://vuir.vu.edu.au/34652/1/jqas-2015-0059.pdf

start_elo = 1500
grand_slam_multiplier = 1.1
use_fte_K = True
remove_walkovers = True
remove_retirements = True
tourney_levels = ['G', 'M', 'A']

if use_fte_K:
    def get_K(n_matches):
        return 250 / np.power(n_matches + 5, 0.4)
else:
    def get_K(n_matches): 
        return 20

from collections import defaultdict
# The Elo rating at any point in time.
elo_ratings = defaultdict(lambda: start_elo)
# The number of matches played by each player, at any point in time.
n_matches_played = defaultdict(lambda: 1)
# The latest tournament date.
latest_tourney_date = defaultdict(lambda: 0)

# Perform predictions on the final year.
prediction_year = end_year
predicted_winner_probabilities = {}
n_predictions = 0
n_correct_predictions = 0
n_grand_slams = 0
n_masters = 0

for index, row in df.iterrows():

    if row['tourney_level'] not in tourney_levels:
        continue
    
    if remove_walkovers:
        if pd.isna(row['score']) or 'W/O' in row['score']:
            # This match wasn't played.
            continue
          
    winner = row['winner_name']
    loser = row['loser_name']
    
    n_matches_played[winner] += 1
    n_matches_played[loser] += 1
    
    latest_tourney_date[winner] = row['tourney_date']
    latest_tourney_date[loser] = row['tourney_date']
    
    if remove_retirements:
        if 'RET' in row['score'] or 'ABD' in row['score'] or 'unfinished' in row['score'] or 'DEF' in row['score']:
            # Don't update the ratings for this match.
            continue
            
#     print('-', row['tourney_date'], row['score'], '-')
    
    # The pre-match ratings.
    winner_rating = elo_ratings[winner]
    loser_rating = elo_ratings[loser]
        
    # The pre-match win expectancy.
    rating_diff = winner_rating - loser_rating
    winner_win_expectancy = 1 / (10 ** (-rating_diff / 400) + 1)
    loser_win_expectancy = 1 - winner_win_expectancy
    
    # The post-match ratings.
    multiplier = 1.0
    if row['tourney_level'] == 'G':
        multiplier = grand_slam_multiplier
    K_winner = get_K(n_matches_played[winner]) * multiplier
    K_loser = get_K(n_matches_played[loser]) * multiplier
    new_winner_rating = winner_rating + K_winner * (1 - winner_win_expectancy)
    new_loser_rating = loser_rating + K_loser * (0 - loser_win_expectancy)
    
    current_year = int(str(row['tourney_date'])[0:4])
    if current_year == prediction_year:
        # Record the prediction for this year.
        n_predictions += 1
        if winner_win_expectancy > 0.5:
            n_correct_predictions += 1
        if row['tourney_level'] == 'G':
            n_grand_slams += 1
        if row['tourney_level'] == 'M':
            n_masters += 1
    
    elo_ratings[winner] = new_winner_rating
    elo_ratings[loser] = new_loser_rating
    

In [75]:
accuracy = n_correct_predictions / n_predictions
print("Predicted %d / %d = %.3f correct predictions" % (n_correct_predictions, n_predictions, accuracy))
print("%d grand slams, %d masters:" % (n_grand_slams, n_masters))

Predicted 762 / 1199 = 0.636 correct predictions
123 grand slams, 347 masters:


In [77]:
# Remove players who haven't played since the start of the previous year.
cutoff_date = int(str(end_year - 1) + "0000")
print(cutoff_date)
for p in latest_tourney_date.keys():
    if latest_tourney_date[p] < cutoff_date:
        try:
            del elo_ratings[p]
#             print("Removing player %s, latest date %s" % (p, latest_tourney_date[p]))
        except:
            pass

df_elo_ratings = pd.DataFrame.from_dict(elo_ratings, orient='index')
df_elo_ratings.columns = ['rating']
df_elo_ratings = df_elo_ratings.sort_values(['rating'], ascending=[False])
df_elo_ratings.head

20170000


<bound method NDFrame.head of                                   rating
Roger Federer                2331.367268
Rafael Nadal                 2303.170820
Novak Djokovic               2295.163544
Andy Murray                  2285.677671
Juan Martin Del Potro        2174.431072
Alexander Zverev             2159.239377
Kei Nishikori                2133.182952
Nick Kyrgios                 2086.791775
Jo Wilfried Tsonga           2063.618241
Milos Raonic                 2060.191096
Dominic Thiem                2058.397403
Grigor Dimitrov              2050.180493
David Goffin                 2035.444391
Stanislas Wawrinka           2034.865399
Marin Cilic                  2023.149848
Tomas Berdych                2001.860219
Kevin Anderson               1991.954081
Borna Coric                  1974.660830
Roberto Bautista Agut        1969.074225
Gael Monfils                 1966.598511
Kyle Edmund                  1965.027031
Hyeon Chung                  1964.850076
Denis Shapovalov           

This seems pretty good... it might not match the 538 or tennisabstract ratings, but at least the top players come out on top.

To-do:
* Maybe filter so that only currently players are displayed.
* Add the time-off penalty: http://www.tennisabstract.com/blog/2018/05/15/handling-injuries-and-absences-with-tennis-elo/