# Tennis Elo Rating

Import standard packages

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the training matches and the test fixtures that need predictions.
utr_train = pd.read_csv(filepath_or_buffer='utr_train_UPDATED.csv')
utr_test = pd.read_csv(filepath_or_buffer='utr_test_UPDATED.csv')

In [4]:
# Preview the first rows of the training matches.
utr_train.head()

Unnamed: 0,resultid,resultmonth,winnerid,loserid,winnerset1,winnerset2,winnerset3,winnerset4,winnerset5,loserset1,loserset2,loserset3,loserset4,loserset5
0,1,2019-05,57529,3765,7,7,0,0,0,6,5,0,0,0
1,2,2019-03,83218,3871,6,6,0,0,0,4,2,0,0,0
2,4,2019-11,4021,4487,7,5,7,0,0,5,7,5,0,0
3,5,2019-10,1984892,411593,6,6,0,0,0,3,3,0,0,0
4,7,2019-09,52294,224678,6,6,0,0,0,3,2,0,0,0


Sort matches chronologically and drop unneeded columns

In [5]:
# Order the matches by month so ratings are updated chronologically, then
# discard the month column once the ordering has been applied.
utr_train = (
    utr_train
    .sort_values(by='resultmonth')
    .drop(columns='resultmonth')
)
utr_train

Unnamed: 0,resultid,winnerid,loserid,winnerset1,winnerset2,winnerset3,winnerset4,winnerset5,loserset1,loserset2,loserset3,loserset4,loserset5
420,578,11063,45027,6,6,0,0,0,1,0,0,0,0
2818,4026,847617,10788,1,6,6,0,0,6,3,4,0,0
2816,4022,11076,2555950,6,6,0,0,0,3,2,0,0,0
1593,2249,58235,79636,6,6,0,0,0,2,3,0,0,0
1589,2244,3498,83218,4,6,6,0,0,6,4,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,382,86294,233912,6,6,0,0,0,3,3,0,0,0
3886,5591,238409,153449,4,6,6,0,0,6,3,1,0,0
3903,5613,50083,191924,2,6,6,0,0,6,2,2,0,0
2668,3816,11640,42381,1,6,6,0,0,6,1,2,0,0


Create the **player_list** variable, which is used to build the table that tracks and updates each player's Elo after every input match.

Note: I assume everyone starts off with the same elo at 1500

In [6]:
# Every distinct player id that appears in the training matches, whether as
# winner or loser. pd.concat is used instead of Series.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
player_list = pd.concat([utr_train['winnerid'], utr_train['loserid']]).unique()

# One row per player; every player starts from the same rating of 1500.
player_df = pd.DataFrame(player_list, columns=['id'])
player_df = player_df.assign(elo=1500.0)

In [7]:
# Display the freshly initialised player table (all ratings at the starting value).
player_df

Unnamed: 0,id,elo
0,11063,1500.0
1,847617,1500.0
2,11076,1500.0
3,58235,1500.0
4,3498,1500.0
...,...,...
1141,34161,1500.0
1142,217710,1500.0
1143,25603,1500.0
1144,90999,1500.0


In [8]:
import math 

# Formula to calculate prob of Player 1 winning against player 2
# Derived from Arpad Elo chess algorithm: https://en.wikipedia.org/wiki/Elo_rating_system

def expectedValue(rating1, rating2):
    """Return the Elo-expected score of player 1 against player 2.

    Uses the logistic Elo formula with base 10 and a scale of 400 rating
    points: equal ratings give 0.5, and the result grows toward 1 as
    ``rating1`` exceeds ``rating2``.

    Args:
        rating1: Current rating of player 1.
        rating2: Current rating of player 2.

    Returns:
        A value strictly between 0 and 1 for finite, moderate rating gaps.
    """
    # Rating gap from player 1's point of view, scaled to the Elo exponent.
    exponent = (rating2 - rating1) / 400
    odds_against = 10.0 ** exponent
    return 1.0 / (1.0 + odds_against)
  
    
  
# Function to calculate/update the Elo rating of both players
# K is a constant that changes the degree of Elo gain/loss per matchup
# The first rating (R1) is treated as the winner's and the second (R2) as the loser's

def EloRating2(R1, R2, K=32):
    """Return the updated ratings after the R1 player beats the R2 player.

    Args:
        R1: Pre-match rating of the winner.
        R2: Pre-match rating of the loser.
        K: Step size; scales how far each rating moves per match.

    Returns:
        A ``(winner_rating, loser_rating)`` tuple of post-match ratings.
    """
    # Expected score of each side before the match is played.
    expected_winner = expectedValue(R1, R2)
    expected_loser = expectedValue(R2, R1)

    # The winner's actual score is 1 and the loser's is 0; each rating moves
    # by K times (actual score - expected score).
    winner_delta = K * (1 - expected_winner)
    loser_delta = K * (0 - expected_loser)

    return (R1 + winner_delta, R2 + loser_delta)
  
    
# Sanity check: a 1200-rated player (Ra) beats a 1000-rated player (Rb).
Ra, Rb = 1200, 1000
EloRating2(Ra, Rb)
  



(1207.6880983472654, 992.3119016527346)

Large for loop through the **utr_train** dataframe

- Compute each match from the training set and update each player's Elo in the **player_df** dataframe after each match

In [9]:
# Replay every training match in the current row order of utr_train and update
# both players' ratings after each one.
#
# Ratings are kept in a dict keyed by player id while replaying. This gives a
# constant-time lookup per player instead of scanning player_df for every
# match, and it does not depend on player_df having a default 0..n-1 index or
# on the column positions of utr_train.
elo_by_id = dict(zip(player_df['id'], player_df['elo']))

for m_winner, m_loser in zip(utr_train['winnerid'], utr_train['loserid']):
    new_winner_rank, new_loser_rank = EloRating2(elo_by_id[m_winner], elo_by_id[m_loser])

    # Store the post-match ratings so later matches see them.
    elo_by_id[m_winner] = new_winner_rank
    elo_by_id[m_loser] = new_loser_rank

# Write the final ratings back to player_df, aligned by player id.
player_df['elo'] = player_df['id'].map(elo_by_id)


    

In [10]:
# Display the player table after the ratings have been updated from the training matches.
player_df

Unnamed: 0,id,elo
0,11063,1529.688787
1,847617,1468.706256
2,11076,1588.469766
3,58235,1548.220625
4,3498,1499.984660
...,...,...
1141,34161,1483.858080
1142,217710,1482.699653
1143,25603,1483.366618
1144,90999,1483.112875


## Calculating Elo Prob

In [11]:
# Add a placeholder prob column to the training matches, then move the
# existing row labels into a regular 'index' column so rows are labelled 0..n-1.
utr_train = utr_train.assign(prob=0.0).reset_index()
utr_train

Unnamed: 0,index,resultid,winnerid,loserid,winnerset1,winnerset2,winnerset3,winnerset4,winnerset5,loserset1,loserset2,loserset3,loserset4,loserset5,prob
0,420,578,11063,45027,6,6,0,0,0,1,0,0,0,0,0.0
1,2818,4026,847617,10788,1,6,6,0,0,6,3,4,0,0,0.0
2,2816,4022,11076,2555950,6,6,0,0,0,3,2,0,0,0,0.0
3,1593,2249,58235,79636,6,6,0,0,0,2,3,0,0,0,0.0
4,1589,2244,3498,83218,4,6,6,0,0,6,4,4,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4547,264,382,86294,233912,6,6,0,0,0,3,3,0,0,0,0.0
4548,3886,5591,238409,153449,4,6,6,0,0,6,3,1,0,0,0.0
4549,3903,5613,50083,191924,2,6,6,0,0,6,2,2,0,0,0.0
4550,2668,3816,11640,42381,1,6,6,0,0,6,1,2,0,0,0.0


In [12]:
# Fill utr_train['prob'] with the Elo-expected probability that the recorded
# winner beats the recorded loser, using the ratings currently stored in
# player_df.
#
# Players are looked up by id through a dict (constant time per lookup) and
# match columns are read by name, so the result does not depend on column
# positions or on the index labels of either frame.
elo_by_id = dict(zip(player_df['id'], player_df['elo']))

utr_train['prob'] = [
    expectedValue(elo_by_id[m_winner], elo_by_id[m_loser])
    for m_winner, m_loser in zip(utr_train['winnerid'], utr_train['loserid'])
]


In [13]:
# Inspect the first 25 training matches together with their computed prob values.
utr_train[:25]

Unnamed: 0,index,resultid,winnerid,loserid,winnerset1,winnerset2,winnerset3,winnerset4,winnerset5,loserset1,loserset2,loserset3,loserset4,loserset5,prob
0,420,578,11063,45027,6,6,0,0,0,1,0,0,0,0,0.424898
1,2818,4026,847617,10788,1,6,6,0,0,6,3,4,0,0,0.50728
2,2816,4022,11076,2555950,6,6,0,0,0,3,2,0,0,0,0.616359
3,1593,2249,58235,79636,6,6,0,0,0,2,3,0,0,0,0.564893
4,1589,2244,3498,83218,4,6,6,0,0,6,4,4,0,0,0.295505
5,2082,2954,3691,3469,7,7,0,0,0,5,5,0,0,0,0.441408
6,3814,5486,3564,224142,6,7,6,0,0,4,5,1,0,0,0.510233
7,2083,2955,52337,51245,7,7,0,0,0,5,6,0,0,0,0.48053
8,3830,5505,3832,52294,6,6,7,0,0,1,1,6,0,0,0.462541
9,3832,5508,10788,1609206,6,5,7,0,0,2,7,5,0,0,0.509381


In [14]:
# Display the test fixtures; player1winprobability is still to be filled in.
utr_test

Unnamed: 0,resultid,resultmonth,player1,player2,player1winprobability
0,3,2019-02,4372,3464,
1,6,2019-02,3671,3507,
2,8,2019-08,4584,53374,
3,9,2019-08,10759,10751,
4,13,2019-10,405461,1194009,
...,...,...,...,...,...
1839,6524,2019-11,3654,87175,
1840,6537,2019-11,79558,54130,
1841,6543,2019-02,1517181,4031,
1842,6545,2019-08,10751,10274,


For loop that calculates the player 1 win probabilities in the test dataset.
Probabilities are calculated from the Elo ratings of the two players, which were obtained by looping
through the training set in chronological order. Elos are looked up in **player_df**, using the player id as a foreign-key reference.

In [15]:
# Fill utr_test['player1winprobability'] with the Elo-expected probability
# that player1 beats player2, using the ratings stored in player_df.
#
# A player who never appeared in the training matches has no row in
# player_df; such a player is given the common starting rating of 1500
# instead of aborting the whole cell with an IndexError.
default_elo = 1500.0
elo_by_id = dict(zip(player_df['id'], player_df['elo']))

utr_test['player1winprobability'] = [
    expectedValue(
        elo_by_id.get(player1_id, default_elo),
        elo_by_id.get(player2_id, default_elo),
    )
    for player1_id, player2_id in zip(utr_test['player1'], utr_test['player2'])
]

In [16]:
# Inspect the first 25 test fixtures together with their predicted probabilities.
utr_test[:25]

Unnamed: 0,resultid,resultmonth,player1,player2,player1winprobability
0,3,2019-02,4372,3464,0.54817
1,6,2019-02,3671,3507,0.452522
2,8,2019-08,4584,53374,0.293999
3,9,2019-08,10759,10751,0.66723
4,13,2019-10,405461,1194009,0.557416
5,16,2019-10,52346,77442,0.512415
6,18,2019-03,10378,51673,0.73992
7,19,2019-11,3850,54846,0.634702
8,20,2019-07,207037,95506,0.473536
9,26,2019-09,10244,10839,0.462594


In [17]:
# Write the test frame, including the filled-in probabilities, to CSV without the row index.
utr_test.to_csv(r'OutputPredict.csv', index=False)